Decision
I came up with a class that extracts the doc comments of the classes and methods in a file. Thanks to everyone who answered this question, and the other one about matching code blocks.
Average benchmark times for the following example range from 0.00495 to 0.00505 seconds.
<?php

// Location of the script that declares the Tokenizer class.
$file = 'path/to/libraries/tokenizer.php';

// Pull the class definition into the current script.
include $file;

// Create the instance that will be used to inspect source files.
$tokenizer = new Tokenizer();
The Tokenizer class (yes, I still haven't thought of a better name for it...):
<?php

/**
 * Extracts the doc comments of the classes and methods declared in a PHP file.
 *
 * The file is tokenized with token_get_all() and reduced to a compact
 * "compiled" string in which every significant token is written as
 * "<token index>:<token id>", separated by commas. Braces and semicolons are
 * kept as literal "{", "}" and ";" entries. Declarations are then located by
 * running regular expressions over that string, and the captured token indexes
 * are used to look the original text up again in the token list.
 *
 * Token ids are always taken from the T_* constants: their numeric values are
 * not stable between PHP versions, so literal numbers must not be used.
 */
class Tokenizer
{
    private $compiled = false,
            $path     = false,
            $tokens   = false,
            $classes  = array();

    /**
     * Parses a file and indexes its classes, their methods and doc comments.
     *
     * Any state from a previous call is discarded.
     *
     * @param string $path Path of the PHP file to analyse.
     *
     * @return bool True when the file was read and analysed, false when the
     *              path is not a readable regular file, the file could not be
     *              read, or the tokenizer extension is unavailable.
     */
    public function load($path)
    {
        $path = realpath($path);

        // realpath() yields false for missing paths; directories are rejected
        // too because they cannot be tokenized.
        if ($path === false
            || !is_file($path)
            || !is_readable($path)
            || !function_exists('token_get_all')
        ) {
            return false;
        }

        $this->compiled = false;
        $this->classes  = array();
        $this->path     = $path;
        $this->tokens   = false;

        $this->get_tokens();

        // get_tokens() leaves the token list at false when reading failed.
        if ($this->tokens === false) {
            return false;
        }

        $this->get_classes();
        $this->class_blocks();
        $this->class_functions();

        return true;
    }

    /**
     * Tokenizes the loaded file and builds the compiled token string.
     *
     * Whitespace tokens are dropped. The tokens that open an interpolation
     * inside a string ("{$" and "${") are emitted as "{" because their
     * closing counterpart is reported by the tokenizer as a bare "}";
     * without this the brace matching in get_block() would be thrown off.
     *
     * @return void
     */
    protected function get_tokens()
    {
        $source = file_get_contents($this->path);

        if ($source === false) {
            return;
        }

        $tokens = token_get_all($source);
        $parts  = array();

        foreach ($tokens as $index => $token) {
            if (is_array($token)) {
                if ($token[0] === T_CURLY_OPEN || $token[0] === T_DOLLAR_OPEN_CURLY_BRACES) {
                    $parts[] = '{';
                } elseif ($token[0] !== T_WHITESPACE) {
                    $parts[] = $index . ':' . $token[0];
                }
            } elseif ($token === '{' || $token === '}' || $token === ';') {
                // ";" is kept so that a declaration without a body (abstract
                // method) can never be matched against a later "{".
                $parts[] = $token;
            }
        }

        $this->tokens   = $tokens;
        $this->compiled = implode(',', $parts);
    }

    /**
     * Returns the ids of the tokens that may form a class or interface name.
     *
     * Which tokens exist depends on the PHP version (qualified names are
     * single tokens in newer versions), so only defined constants are used.
     *
     * @return int[] Token ids.
     */
    private function name_token_ids()
    {
        $ids = array(T_STRING);

        $optional = array(
            'T_NAMESPACE',
            'T_NS_SEPARATOR',
            'T_NAME_QUALIFIED',
            'T_NAME_FULLY_QUALIFIED',
            'T_NAME_RELATIVE',
        );

        foreach ($optional as $constant) {
            if (defined($constant)) {
                $ids[] = constant($constant);
            }
        }

        return $ids;
    }

    /**
     * Finds every named class declaration and records its name token and doc
     * comment.
     *
     * Matches: optional doc comment, any number of abstract/final modifiers,
     * "class", the class name, any extends/implements clauses (each followed
     * by one or more name tokens) and the opening brace.
     *
     * @return false|null False when no compiled token string is available.
     */
    protected function get_classes()
    {
        if (!$this->compiled) {
            return false;
        }

        $names = '(?:\d+:(?:' . implode('|', $this->name_token_ids()) . '),)+';

        $regex = '%'
            . '(?:(\d+):' . T_DOC_COMMENT . ',)?'
            . '(?:\d+:(?:' . T_ABSTRACT . '|' . T_FINAL . '),)*'
            . '\d+:' . T_CLASS . ','
            . '(\d+):' . T_STRING . ','
            . '(?:\d+:(?:' . T_EXTENDS . '|' . T_IMPLEMENTS . '),' . $names . ')*'
            . '\{%';

        if (!preg_match_all($regex, $this->compiled, $classes, PREG_SET_ORDER)) {
            return;
        }

        foreach ($classes as $class) {
            $name = $this->tokens[$class[2]][1];

            // Group 1 is an empty string when there is no doc comment.
            $this->classes[$name] = array(
                'token' => $class[2],
                'doc'   => $class[1] !== '' ? $this->tokens[$class[1]][1] : false,
            );
        }
    }

    /**
     * Stores the compiled body of every class found by get_classes().
     *
     * @return false|null False when no compiled token string is available.
     */
    private function class_blocks()
    {
        if (!$this->compiled) {
            return false;
        }

        foreach ($this->classes as $class_name => $class) {
            $this->classes[$class_name]['block'] = $this->get_block($class['token']);
        }
    }

    /**
     * Returns the part of the compiled string that belongs to a declaration.
     *
     * The result starts at the given name token and ends just before the
     * brace that closes the declaration's body. If a ";" is met before any
     * brace was opened the declaration has no body and the result ends there.
     *
     * @param int|string $name_token Index of the declaration's name token.
     *
     * @return string|false The compiled block, or false when nothing is
     *                      compiled or the token is not part of the string.
     */
    protected function get_block($name_token)
    {
        if (!$this->compiled) {
            return false;
        }

        // Indexes only ever appear directly before a ":" and in ascending
        // order, so the first hit is the requested token.
        $pos = strpos($this->compiled, $name_token . ':');

        if ($pos === false) {
            return false;
        }

        $section = substr($this->compiled, $pos);
        $length  = strlen($section);
        $depth   = 0;

        for ($i = 0; $i < $length; $i++) {
            $char = $section[$i];

            if ($char === '{') {
                $depth++;
            } elseif ($char === '}') {
                $depth--;

                // Back at the starting level: this brace closes the body.
                if ($depth <= 0) {
                    break;
                }
            } elseif ($char === ';' && $depth === 0) {
                // Statement ended before a body was opened.
                break;
            }
        }

        return trim(substr($section, 0, $i), ',');
    }

    /**
     * Finds the named functions inside every class block and records their
     * name token, doc comment and compiled body.
     *
     * Matches: optional doc comment, any number of modifiers in any order,
     * "function", the function name, any further tokens (the parameter list
     * and a return type) and the opening brace. Declarations without a body
     * are not matched.
     *
     * @return false|null False when no compiled token string is available.
     */
    protected function class_functions()
    {
        if (!$this->compiled) {
            return false;
        }

        $modifiers = implode('|', array(
            T_ABSTRACT,
            T_FINAL,
            T_PUBLIC,
            T_PROTECTED,
            T_PRIVATE,
            T_STATIC,
        ));

        $regex = '%'
            . '(?:(\d+):' . T_DOC_COMMENT . ',)?'
            . '(?:\d+:(?:' . $modifiers . '),)*'
            . '\d+:' . T_FUNCTION . ','
            . '(\d+):' . T_STRING . ','
            . '(?:\d+:\d+,)*'
            . '\{%';

        foreach ($this->classes as $class_name => $class) {
            $this->classes[$class_name]['functions'] = array();

            if (!isset($class['block']) || !is_string($class['block'])) {
                continue;
            }

            if (!preg_match_all($regex, $class['block'], $functions, PREG_SET_ORDER)) {
                continue;
            }

            foreach ($functions as $function) {
                $function_name = $this->tokens[$function[2]][1];

                $this->classes[$class_name]['functions'][$function_name] = array(
                    'token' => $function[2],
                    'doc'   => $function[1] !== '' ? $this->tokens[$function[1]][1] : false,
                    'block' => $this->get_block($function[2]),
                );
            }
        }
    }

    /**
     * Returns the doc comment of a class, or of one of its methods.
     *
     * @param string       $class    Name of a class found in the loaded file.
     * @param string|false $function Method name; any non-string value asks
     *                               for the class's own doc comment.
     *
     * @return string|false The doc comment text, or false when the class or
     *                      method is unknown or has no doc comment.
     */
    public function get_doc($class, $function = false)
    {
        if (!is_string($class) || !isset($this->classes[$class])) {
            return false;
        }

        if (!is_string($function)) {
            return $this->classes[$class]['doc'];
        }

        if (!isset($this->classes[$class]['functions'][$function])) {
            return false;
        }

        return $this->classes[$class]['functions'][$function]['doc'];
    }
}
Any thoughts or comments on this? Any criticism is welcome!
Thanks, mniz.