PHP Code Highlighter
Posted: August 05, 2007
I don't like the way the built-in function to colorize PHP code works, so I wrote my own PHP code highlighter.
I don't plan to write an explanation of how this works. If you are curious, just look at the code. The string handling can be a little confusing sometimes because of all the special cases, but the rest is pretty simple.
<?php
// this is not defined in my version of php
if (! defined('T_METHOD')) define('T_METHOD', 361);
define('TOKEN_NORMAL', 0);
define('TOKEN_WHITESPACE', 1);
define('TOKEN_COMMENT', 2);
define('TOKEN_OPERATOR', 3);
define('TOKEN_CAST', 4);
define('TOKEN_KEYWORD', 5);
define('TOKEN_PHPTAG', 6);
define('TOKEN_STRING', 7);
define('TOKEN_INTEGER', 8);
define('TOKEN_OCT_INT', 9);
define('TOKEN_HEX_INT', 10);
define('TOKEN_FLOAT', 11);
define('TOKEN_BUILTIN', 12);
define('TOKEN_HTML', 13);
define('TOKEN_PUNCTUATION', 14);
define('TOKEN_STRING_VAR', 15);
define('TOKEN_STRING_VAR_OPEN', 16);
define('TOKEN_HERE_DOC', 17);
define('TOKEN_VAR', 18);
define('TOKEN_BAD_CHAR', 19);
define('TOKEN_UNQUOTED_STRING', 20);
function phpcode_get_type($token) {
if (is_array($token)) {
$type = $token[0];
$token = $token[1];
}
else {
$type = $token;
}
switch ($type) {
case T_BAD_CHARACTER:
$type = TOKEN_BAD_CHAR;
break;
case T_WHITESPACE:
$type = TOKEN_WHITESPACE;
break;
case T_COMMENT:
case T_DOC_COMMENT:
//case T_ML_COMMENT: // php4 only
$type = TOKEN_COMMENT;
break;
case T_AND_EQUAL: case T_BOOLEAN_AND: case T_BOOLEAN_OR: case T_CONCAT_EQUAL:
case T_INSTANCEOF: case T_DEC: case T_DIV_EQUAL: case T_DOUBLE_ARROW:
case T_DOUBLE_COLON: case T_PAAMAYIM_NEKUDOTAYIM: // same as T_DOUBLE_COLON
case T_INC: case T_IS_EQUAL: case T_IS_GREATER_OR_EQUAL: case T_IS_IDENTICAL:
case T_IS_NOT_EQUAL: case T_IS_NOT_IDENTICAL: case T_IS_SMALLER_OR_EQUAL:
case T_LOGICAL_AND: case T_LOGICAL_OR: case T_NEW: case T_LOGICAL_XOR:
case T_MINUS_EQUAL: case T_MOD_EQUAL: case T_MUL_EQUAL: case T_OBJECT_OPERATOR:
case T_OR_EQUAL: case T_PLUS_EQUAL: case T_SL: case T_SL_EQUAL: case T_SR:
case T_SR_EQUAL: case T_XOR_EQUAL: case '=': case '!': case '%': case '^':
case '&': case '*': case '-': case '+': case '<': case '>': case '.': case '/':
case '@': case '~': case '|': case '?': case ':': case '[': case ']':
$type = TOKEN_OPERATOR;
break;
case T_ARRAY_CAST: case T_BOOL_CAST: case T_DOUBLE_CAST: case T_INT_CAST:
case T_OBJECT_CAST: case T_STRING_CAST: case T_UNSET_CAST:
$type = TOKEN_CAST;
break;
case T_ABSTRACT: case T_ARRAY: case T_AS: case T_BREAK: case T_CASE:
case T_CATCH: case T_CLASS: case T_CLONE: case T_CONST: case T_CONTINUE:
case T_DECLARE: case T_DEFAULT: case T_DO: case T_ECHO: case T_ELSE:
case T_ELSEIF: case T_ENDDECLARE: case T_ENDFOR: case T_ENDFOREACH: case T_ENDIF:
case T_ENDSWITCH: case T_ENDWHILE: case T_EXTENDS: case T_FINAL: case T_FOR:
case T_FOREACH: case T_FUNCTION: case T_GLOBAL: case T_IF: case T_IMPLEMENTS:
case T_INTERFACE: case T_PRIVATE: case T_PUBLIC: case T_PROTECTED: case T_RETURN:
case T_STATIC: case T_SWITCH: case T_THROW: case T_TRY: case T_USE: case T_VAR:
case T_WHILE: case T_METHOD:
$type = TOKEN_KEYWORD;
break;
case T_CLOSE_TAG: case T_OPEN_TAG: case T_OPEN_TAG_WITH_ECHO:
$type = TOKEN_PHPTAG;
break;
case T_NUM_STRING: case T_CONSTANT_ENCAPSED_STRING:
case T_ENCAPSED_AND_WHITESPACE: case '"': case '`': case "'": case T_CHARACTER:
$type = TOKEN_STRING;
break;
case T_STRING:
if (strtolower($token) == 'true'
|| strtolower($token) == 'false'
|| strtolower($token) == 'null') {
$type = TOKEN_KEYWORD;
}
else {
$type = TOKEN_UNQUOTED_STRING;
}
break;
case T_END_HEREDOC: case T_START_HEREDOC:
$type = TOKEN_HERE_DOC;
break;
case T_STRING_VARNAME:
$type = TOKEN_STRING_VAR;
break;
case T_DOLLAR_OPEN_CURLY_BRACES:
case T_CURLY_OPEN:
$type = TOKEN_STRING_VAR_OPEN;
break;
case T_LNUMBER:
// not anchoring the pattern to the end for octal is intentional
//(see the php documentation)
if (preg_match('/^0[0-7]+/', $token)) $type = TOKEN_OCT_INT;
elseif (preg_match('/^0[xX][0-9a-fA-F]+$/', $token)) $type = TOKEN_HEX_INT;
else $type = TOKEN_INTEGER;
break;
case T_DNUMBER:
$type = TOKEN_FLOAT;
break;
case T_EMPTY: case T_EVAL: case T_EXIT: case T_FILE: case T_LINE: case T_FUNC_C:
case T_CLASS_C: case T_HALT_COMPILER: case T_INCLUDE: case T_INCLUDE_ONCE:
case T_ISSET: case T_LIST: case T_PRINT: case T_REQUIRE: case T_REQUIRE_ONCE:
case T_UNSET:
$type = TOKEN_BUILTIN;
break;
case T_INLINE_HTML:
$type = TOKEN_HTML;
break;
case '(': case ')': case '{': case '}': case ',': case ';':
$type = TOKEN_PUNCTUATION;
break;
case T_VARIABLE:
$type = TOKEN_VAR;
break;
default:
$type = TOKEN_NORMAL;
}
return array($type, $token);
}
function phpcode_token_get_all($code, $stripComments=false) {
// single line comments do not eat the \n if code is in dos format.
$code = str_replace("\r", '', $code);
$tokens = token_get_all($code);
$newTokens = array();
$inStrVar = 0;
$insideString = false;
$endString = '';
foreach ($tokens as $key=>$token) {
list($type, $token) = phpcode_get_type($token);
if (($token == '`' && ! $inStrVar)
|| ($token == '"' && ! $inStrVar)
|| ($type == TOKEN_HERE_DOC)) {
if ($insideString && $type == $endString) {
$insideString = false;
$endStr = $type;
}
elseif ( ! $insideString) {
$insideString = true;
$endString = $type;
}
}
if (($type == TOKEN_BAD_CHAR)
|| ($stripComments && $type == TOKEN_COMMENT)) {
continue;
}
if ($type == TOKEN_UNQUOTED_STRING) {
if ($insideString) $type = TOKEN_STRING;
else $type = TOKEN_NORMAL;
}
if ($insideString) {
if ($type == TOKEN_VAR) {
$type = TOKEN_STRING_VAR;
}
elseif ($type == TOKEN_STRING_VAR_OPEN) {
$type = TOKEN_STRING_VAR;
++ $inStrVar;
}
elseif ($token == '}' && !$inStrVar) {
$type = TOKEN_STRING;
}
else {
$type = TOKEN_STRING;
}
if ($inStrVar) {
$type = TOKEN_STRING_VAR;
if ($token == '}') {
-- $inStrVar;
}
}
}
$newTokens[$key] = array($type, $token);
}
return $newTokens;
}
function phpcode_highlight($code, $stripComments=false, $compressWhitespace=false) {
$classList = array(
TOKEN_NORMAL => 'normal',
TOKEN_WHITESPACE => 'whitespace',
TOKEN_COMMENT => 'comment',
TOKEN_OPERATOR => 'operator',
TOKEN_CAST => 'cast',
TOKEN_KEYWORD => 'keyword',
TOKEN_PHPTAG => 'phptag',
TOKEN_STRING => 'string',
TOKEN_INTEGER => 'int number',
TOKEN_OCT_INT => 'oct int number',
TOKEN_HEX_INT => 'hex int number',
TOKEN_FLOAT => 'float number',
TOKEN_BUILTIN => 'keyword',
TOKEN_HTML => 'html',
TOKEN_PUNCTUATION => 'punctuation',
TOKEN_STRING_VAR => 'string variable',
TOKEN_HERE_DOC => 'string',
TOKEN_VAR => 'variable',
);
$code = phpcode_token_get_all($code, $stripComments);
$html = '';
$lastClass = null;
$temp = null;
foreach ($code as $token) { // [0] = type, [1] = token
list ($type, $str) = $token;
$class = $classList[$type];
if ($class == $lastClass) {
$temp .= $str; // store for later
}
else {
if ($compressWhitespace) {
$temp = preg_replace('/(\\n\\s+\\n)/', "\n", $temp);
}
if ($temp !== null) {
$html .= sprintf('<span class="%s">%s</span>', $lastClass,
htmlspecialchars($temp));
}
$temp = $str; // store for later
$lastClass = $class;
}
}
// there has to be an easy way to prevent duplicating this code...
if ($compressWhitespace) {
$temp = preg_replace('/(\\n\\s+\\n)/', "\n", $temp);
}
if ($temp) {
$html .= sprintf('<span class="%s">%s</span>', $lastClass,
htmlspecialchars($temp));
}
return $html;
}
?>
Download