MichaelCrumley.com

PHP Code Highlighter

Posted: August 05, 2007
Categories: PHP

I don't like the way the built-in function to colorize PHP code works, so I wrote my own PHP code highlighter.

I don't plan to write an explanation of how this works. If you are curious, just look at the code. The string handling can be a little confusing sometimes because of all the special cases, but the rest is pretty simple.

<?php
// this is not defined in my version of php
if (! defined('T_METHOD')) define('T_METHOD', 361);

define('TOKEN_NORMAL', 0);
define('TOKEN_WHITESPACE', 1);
define('TOKEN_COMMENT', 2);
define('TOKEN_OPERATOR', 3);
define('TOKEN_CAST', 4);
define('TOKEN_KEYWORD', 5);
define('TOKEN_PHPTAG', 6);
define('TOKEN_STRING', 7);
define('TOKEN_INTEGER', 8);
define('TOKEN_OCT_INT', 9);
define('TOKEN_HEX_INT', 10);
define('TOKEN_FLOAT', 11);
define('TOKEN_BUILTIN', 12);
define('TOKEN_HTML', 13);
define('TOKEN_PUNCTUATION', 14);
define('TOKEN_STRING_VAR', 15);
define('TOKEN_STRING_VAR_OPEN', 16);
define('TOKEN_HERE_DOC', 17);
define('TOKEN_VAR', 18);
define('TOKEN_BAD_CHAR', 19);
define('TOKEN_UNQUOTED_STRING', 20);

function phpcode_get_type($token) {
    if (is_array($token)) {
        $type = $token[0];
        $token = $token[1];
    }
    else {
        $type = $token;
    }
    switch ($type) {
    case T_BAD_CHARACTER:
        $type = TOKEN_BAD_CHAR;
        break;
    case T_WHITESPACE:
        $type = TOKEN_WHITESPACE;
        break;
    case T_COMMENT:
    case T_DOC_COMMENT:
    //case T_ML_COMMENT: // php4 only
        $type = TOKEN_COMMENT;
        break;
    case T_AND_EQUAL: case T_BOOLEAN_AND: case T_BOOLEAN_OR: case T_CONCAT_EQUAL:
    case T_INSTANCEOF: case T_DEC: case T_DIV_EQUAL: case T_DOUBLE_ARROW:
    case T_DOUBLE_COLON: case T_PAAMAYIM_NEKUDOTAYIM: // same as T_DOUBLE_COLON
    case T_INC: case T_IS_EQUAL: case T_IS_GREATER_OR_EQUAL: case T_IS_IDENTICAL:
    case T_IS_NOT_EQUAL: case T_IS_NOT_IDENTICAL: case T_IS_SMALLER_OR_EQUAL:
    case T_LOGICAL_AND: case T_LOGICAL_OR: case T_NEW: case T_LOGICAL_XOR:
    case T_MINUS_EQUAL: case T_MOD_EQUAL: case T_MUL_EQUAL: case T_OBJECT_OPERATOR:
    case T_OR_EQUAL: case T_PLUS_EQUAL: case T_SL: case T_SL_EQUAL: case T_SR:
    case T_SR_EQUAL: case T_XOR_EQUAL: case '=': case '!': case '%': case '^':
    case '&': case '*': case '-': case '+': case '<': case '>': case '.': case '/':
    case '@': case '~': case '|': case '?': case ':': case '[': case ']':
        $type = TOKEN_OPERATOR;
        break;
    case T_ARRAY_CAST: case T_BOOL_CAST: case T_DOUBLE_CAST: case T_INT_CAST:
    case T_OBJECT_CAST: case T_STRING_CAST: case T_UNSET_CAST:
        $type = TOKEN_CAST;
        break;
    case T_ABSTRACT: case T_ARRAY: case T_AS: case T_BREAK: case T_CASE:
    case T_CATCH: case T_CLASS: case T_CLONE: case T_CONST: case T_CONTINUE:
    case T_DECLARE: case T_DEFAULT: case T_DO: case T_ECHO: case T_ELSE:
    case T_ELSEIF: case T_ENDDECLARE: case T_ENDFOR: case T_ENDFOREACH: case T_ENDIF:
    case T_ENDSWITCH: case T_ENDWHILE: case T_EXTENDS: case T_FINAL: case T_FOR:
    case T_FOREACH: case T_FUNCTION: case T_GLOBAL: case T_IF: case T_IMPLEMENTS:
    case T_INTERFACE: case T_PRIVATE: case T_PUBLIC: case T_PROTECTED: case T_RETURN:
    case T_STATIC: case T_SWITCH: case T_THROW: case T_TRY: case T_USE: case T_VAR:
    case T_WHILE: case T_METHOD:
        $type = TOKEN_KEYWORD;
        break;
    case T_CLOSE_TAG: case T_OPEN_TAG: case T_OPEN_TAG_WITH_ECHO:
        $type = TOKEN_PHPTAG;
        break;
    case T_NUM_STRING: case T_CONSTANT_ENCAPSED_STRING:
    case T_ENCAPSED_AND_WHITESPACE: case '"': case '`': case "'": case T_CHARACTER:
        $type = TOKEN_STRING;
        break;
    case T_STRING:
        if (strtolower($token) == 'true'
         || strtolower($token) == 'false'
         || strtolower($token) == 'null') {
            $type = TOKEN_KEYWORD;
        }
        else {
            $type = TOKEN_UNQUOTED_STRING;
        }
        break;
    case T_END_HEREDOC: case T_START_HEREDOC:
        $type = TOKEN_HERE_DOC;
        break;
    case T_STRING_VARNAME:
        $type = TOKEN_STRING_VAR;
        break;
    case T_DOLLAR_OPEN_CURLY_BRACES:
    case T_CURLY_OPEN:
        $type = TOKEN_STRING_VAR_OPEN;
        break;
    case T_LNUMBER:
        // not anchoring the pattern to the end for octal is intentional
        //(see the php documentation)
        if (preg_match('/^0[0-7]+/', $token)) $type = TOKEN_OCT_INT;
        elseif (preg_match('/^0[xX][0-9a-fA-F]+$/', $token)) $type = TOKEN_HEX_INT;
        else $type = TOKEN_INTEGER;
        break;
    case T_DNUMBER:
        $type = TOKEN_FLOAT;
        break;
    case T_EMPTY: case T_EVAL: case T_EXIT: case T_FILE: case T_LINE: case T_FUNC_C:
    case T_CLASS_C: case T_HALT_COMPILER: case T_INCLUDE: case T_INCLUDE_ONCE:
    case T_ISSET: case T_LIST: case T_PRINT: case T_REQUIRE: case T_REQUIRE_ONCE:
    case T_UNSET:
        $type = TOKEN_BUILTIN;
        break;
    case T_INLINE_HTML:
        $type = TOKEN_HTML;
        break;
    case '(': case ')': case '{': case '}': case ',': case ';':
        $type = TOKEN_PUNCTUATION;
        break;
    case T_VARIABLE:
        $type = TOKEN_VAR;
        break;
    default:
        $type = TOKEN_NORMAL;
    }
    return array($type, $token);
}

function phpcode_token_get_all($code, $stripComments=false) {
    // single line comments do not eat the \n if code is in dos format.
    $code = str_replace("\r", '', $code);
    $tokens = token_get_all($code);
    $newTokens = array();
    $inStrVar = 0;
    $insideString = false;
    $endString = '';
    foreach ($tokens as $key=>$token) {
        list($type, $token) = phpcode_get_type($token);
        if (($token == '`' && ! $inStrVar)
         || ($token == '"' && ! $inStrVar)
         || ($type == TOKEN_HERE_DOC)) {
            if ($insideString && $type == $endString) {
                $insideString = false;
                $endStr = $type;
            }
            elseif ( ! $insideString) {
                $insideString = true;
                $endString = $type;
            }
        }
        if (($type == TOKEN_BAD_CHAR)
         || ($stripComments && $type == TOKEN_COMMENT)) {
            continue;
        }
        if ($type == TOKEN_UNQUOTED_STRING) {
            if ($insideString) $type = TOKEN_STRING;
            else $type = TOKEN_NORMAL;
        }
        if ($insideString) {
            if ($type == TOKEN_VAR) {
                $type = TOKEN_STRING_VAR;
            }
            elseif ($type == TOKEN_STRING_VAR_OPEN) {
                $type = TOKEN_STRING_VAR;
                ++ $inStrVar;
            }
            elseif ($token == '}' && !$inStrVar) {
                $type = TOKEN_STRING;
            }
            else {
                $type = TOKEN_STRING;
            }
            if ($inStrVar) {
                $type = TOKEN_STRING_VAR;
                if ($token == '}') {
                    -- $inStrVar;
                }
            }
        }
        $newTokens[$key] = array($type, $token);
    }
    return $newTokens;
}

function phpcode_highlight($code, $stripComments=false, $compressWhitespace=false) {
$classList = array(
    TOKEN_NORMAL => 'normal',
    TOKEN_WHITESPACE => 'whitespace',
    TOKEN_COMMENT => 'comment',
    TOKEN_OPERATOR => 'operator',
    TOKEN_CAST => 'cast',
    TOKEN_KEYWORD => 'keyword',
    TOKEN_PHPTAG => 'phptag',
    TOKEN_STRING => 'string',
    TOKEN_INTEGER => 'int number',
    TOKEN_OCT_INT => 'oct int number',
    TOKEN_HEX_INT => 'hex int number',
    TOKEN_FLOAT => 'float number',
    TOKEN_BUILTIN => 'keyword',
    TOKEN_HTML => 'html',
    TOKEN_PUNCTUATION => 'punctuation',
    TOKEN_STRING_VAR => 'string variable',
    TOKEN_HERE_DOC => 'string',
    TOKEN_VAR => 'variable',
);
    $code = phpcode_token_get_all($code, $stripComments);
    $html = '';
    $lastClass = null;
    $temp = null;
    foreach ($code as $token) { // [0] = type, [1] = token
        list ($type, $str) = $token;
        $class = $classList[$type];
        if ($class == $lastClass) {
            $temp .= $str; // store for later
        }
        else {
            if ($compressWhitespace) {
                $temp = preg_replace('/(\\n\\s+\\n)/', "\n", $temp);
            }
            if ($temp !== null) {
                $html .= sprintf('<span class="%s">%s</span>', $lastClass,
                    htmlspecialchars($temp));
            }
            $temp = $str; // store for later
            $lastClass = $class;
        }
    }
    // there has to be an easy way to prevent duplicating this code...
    if ($compressWhitespace) {
        $temp = preg_replace('/(\\n\\s+\\n)/', "\n", $temp);
    }
    if ($temp) {
        $html .= sprintf('<span class="%s">%s</span>', $lastClass,
            htmlspecialchars($temp));
    }
    return $html;
}
?>
Download