代码之家  ›  专栏  ›  技术社区  ›  dclowd9901

将html dom转换为多维数组

  •  1
  • dclowd9901  · 技术社区  · 14 年前

    是否可以将 HTML DOM 转换为多维数组,其中键为元素类型,值为形如 #foo .bar 的字符串(以空格分隔,便于用 explode() 拆分)?或者是否已经有现成的实现?

    我知道这个问题可能会激起一些愤怒,我希望没有人链接到那篇关于解析html的文章,但我希望这不是不可能的。谢谢你的帮助。

    附录:理想情况下,会使用php,因为它是我所知道的唯一一种脚本语言。

    1 回复  |  直到 10 年前
        1
  •  0
  •   dclowd9901    14 年前

    感谢您的帮助 :\ 下面这个函数会把一段 HTML 主体转换为多维数组,数组中包含属性、类和 id。

    <?php
    
    /**
     * Converts a chunk of HTML markup into a nested array of elements.
     *
     * Script blocks, style blocks and comments are stripped first. Every
     * remaining tag is then extracted, unknown tags are replaced by a 'br'
     * placeholder (see checkTags()), and the flat tag list is folded into a
     * tree by htmlToArray().
     *
     * @param string $raw_html HTML markup to convert.
     *
     * @return array Nested array as produced by htmlToArray(); an empty array
     *               when the markup contains no tags at all.
     */
    function htmlArrayer($raw_html){

        $match_open_or_closed = '/(\<(\/?[^\>]+)\>)/';
        $match_scripts = '@<script[^>]*?>.*?</script>@si';
        // No "U" modifier here: PCRE_UNGREEDY inverts ".*?" into a greedy match,
        // which would swallow everything between the first <style> and the
        // last </style> in the document.
        $match_styles = '@<style[^>]*?>.*?</style>@si';
        $match_comments = '/<!--.*?-->/si';

        $raw_html = (string) preg_replace(
            array($match_scripts, $match_styles, $match_comments),
            '',
            (string) $raw_html
        );

        // Pad the tag delimiters (and any leftover comment opener) with spaces,
        // then turn every whitespace character into a plain space so that tag
        // contents can later be split with explode(' ').
        $raw_html = str_replace(
            array('>', '<', '!--'),
            array('> ', ' <', '!-- '),
            $raw_html
        );
        $raw_html = (string) preg_replace('/[ \t\r\n]/', ' ', $raw_html);

        preg_match_all($match_open_or_closed, $raw_html, $matches);

        // Without this guard, tag-less input would hand htmlToArray() an empty
        // list to walk.
        if (empty($matches[2])) {
            return array();
        }

        $tags = checkTags($matches[2]);

        return htmlToArray($tags, 0);

    }
    
    /**
     * Replaces every tag that is not on the whitelist with the placeholder 'br'.
     *
     * Each entry is trimmed and split on spaces; its first token, with one
     * optional leading '/', is looked up in the whitelist. Entries that pass
     * are returned untouched (not trimmed); all others become 'br'. Array keys
     * are preserved.
     *
     * @param array $htmlArray Tag contents without the surrounding angle brackets.
     *
     * @return array The same list with unknown tags replaced by 'br'.
     */
    function checkTags($htmlArray) {
        // Flipped so that membership is a single isset() lookup.
        $known_tags = array_flip(array('html', 'body', 'div', 'span', 'applet', 'object', 'iframe', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'blockquote', 'pre', 'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'code', 'del', 'dfn', 'em', 'font', 'img', 'ins', 'kbd', 'q', 's', 'samp', 'small', 'strike', 'strong', 'sub', 'sup', 'tt', 'var', 'b', 'u', 'i', 'center', 'dl', 'dt', 'dd', 'ol', 'ul', 'li','fieldset', 'form', 'label', 'legend', 'table', 'caption', 'tbody', 'tfoot', 'thead', 'tr', 'th', 'td'));

        foreach ($htmlArray as $position => $raw_tag) {
            $tokens = explode(' ', trim($raw_tag));
            $tag_name = $tokens[0];

            // A closing tag is accepted when the name after its single
            // leading slash is whitelisted.
            if (strncmp($tag_name, '/', 1) === 0) {
                $tag_name = (string) substr($tag_name, 1);
            }

            if (!isset($known_tags[$tag_name])) {
                $htmlArray[$position] = 'br';
            }
        }

        return $htmlArray;
    }
    
    /**
     * Folds a flat list of tags into a nested array, starting at one element.
     *
     * The element at $index becomes the root of the returned array: item 0 is
     * the string "<name> <css>", where <css> comes from attrToCSS(). Every
     * direct child follows as its own array of the same shape, built
     * recursively. One-way elements (br, img, hr, ...) become single-item
     * arrays. Deeper descendants are handled by the recursive calls.
     *
     * Leading 'br' placeholders (what checkTags() substitutes for unknown
     * tags such as a doctype) are skipped before the root is chosen.
     *
     * The walk stops at the root's matching closing tag or at the end of the
     * list, whichever comes first, so unclosed or mismatched tags cannot send
     * it past the last element.
     *
     * @param array $untiered_array Flat tag list; assumed to be keyed 0..n-1,
     *                              as returned by preg_match_all().
     * @param int   $index          Position of the element to use as root.
     *
     * @return array Nested element array; empty when no usable root exists at
     *               or after $index.
     */
    function htmlToArray($untiered_array, $index){
        $one_way_elements = array('br', 'img', 'area', 'base', 'basefront', 'hr', 'input', 'link', 'meta', 'col', 'embed', 'param');
        $total = count($untiered_array);
        $untiered_element = array('');

        // Skip every leading placeholder, not just the first one.
        while ($index < $total) {
            $untiered_element = explode(' ', trim($untiered_array[$index]));
            if ($untiered_element[0] !== 'br') {
                break;
            }
            $index++;
        }

        if ($index >= $total) {
            return array();
        }

        $closing_tag = '/' . $untiered_element[0];
        $new_array_layer = array($untiered_element[0] . ' ' . attrToCSS($untiered_array[$index]));

        // Nesting depth relative to the root: 0 means "directly inside the root".
        $tier_check = 0;

        for ($i = $index + 1; $i < $total; $i++) {
            // Trimmed the same way checkTags() does before it inspects a tag.
            $current = trim($untiered_array[$i]);

            // The root's own closing tag at depth 0 ends this layer.
            if ($tier_check === 0 && $current === $closing_tag) {
                break;
            }

            $next_element_name = explode(' ', $current);

            // One-way elements never open a tier; record them only when they
            // are direct children of the root.
            if (in_array($next_element_name[0], $one_way_elements, true)) {
                if ($tier_check === 0) {
                    $new_array_layer[] = array($next_element_name[0] . ' ' . attrToCSS($current));
                }
                continue;
            }

            if (strpos($current, '/') === 0) {
                // Closing tag: step back out one tier. A stray closer at
                // depth 0 is ignored so it cannot push the counter negative
                // and hide later children.
                if ($tier_check > 0) {
                    $tier_check--;
                }
                continue;
            }

            // Opening tag: one tier deeper. Only direct children (depth 1) are
            // expanded here; the recursive call takes care of their content.
            $tier_check++;
            if ($tier_check === 1) {
                $new_array_layer[] = htmlToArray($untiered_array, $i);
            }
        }

        return $new_array_layer;
    }
    
    /**
     * Builds a CSS-selector-like string from a tag's id and class attributes.
     *
     * The id, if present, comes first as "#id"; each class follows as
     * ".class". Parts are separated by single spaces so the result can be
     * split with explode(' '), e.g. 'div id="foo" class="bar baz"' yields
     * "#foo .bar .baz". Only double-quoted attribute values are recognised.
     *
     * @param string $attr_string Tag contents (name plus attributes) without
     *                            the surrounding angle brackets.
     *
     * @return string Selector string; empty when the tag carries neither a
     *                non-empty id nor any class.
     */
    function attrToCSS($attr_string){

        $attr_string = (string) $attr_string;
        $selector_parts = array();

        // The negative lookbehind keeps attributes such as data-id="..." from
        // being mistaken for id="...". [^"]* (instead of .+?) lets an empty
        // value match as empty rather than running into the next attribute.
        if (preg_match('/(?<![\w-])id\s*=\s*"([^"]*)"/i', $attr_string, $id_match) === 1) {
            $id_value = trim($id_match[1]);
            if ($id_value !== '') {
                $selector_parts[] = '#' . $id_value;
            }
        }

        if (preg_match('/(?<![\w-])class\s*=\s*"([^"]*)"/i', $attr_string, $class_match) === 1) {
            // Splitting on runs of whitespace avoids bare "." entries when
            // class names are separated by more than one space.
            $class_names = preg_split('/\s+/', $class_match[1], -1, PREG_SPLIT_NO_EMPTY);
            foreach ($class_names as $class_name) {
                $selector_parts[] = '.' . $class_name;
            }
        }

        return implode(' ', $selector_parts);
    }
    
    
    ?>