zoukankan      html  css  js  c++  java
  • iOS 解析 HTML

    xml,json都有大量的库来解析,我们如何解析html呢?

    TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。

    今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 文中那张图画得一目了然,很值得收藏。这篇文章中的源码不能遍历所有的html,我做了一点修改,可以将整个html遍历并打印出来:

    // Parses an HTML document with libxml2 and logs every node in document order.
    //
    // Inputs expected in scope:
    //   data     - NSData containing the raw document bytes
    //   encoding - the NSStringEncoding of the data
    //   baseURL  - the document's base URL, i.e. its location

    // Resolve the IANA charset name to hand to libxml2. CFStringGetCStringPtr is
    // allowed to return NULL, so the name is copied into a local buffer instead.
    CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
    CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
    char encBuffer[64];
    const char *enc = NULL; // NULL means "no explicit encoding" for libxml2
    if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
    {
        enc = encBuffer;
    }

    // NSData bytes are not NUL-terminated, so parse by explicit length.
    htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
          (int)[data length],
          [[baseURL absoluteString] UTF8String],
          enc,
          HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

    // Depth-first walk. To dump only a subtree (for example the first node of
    // an XPath result set), point rootNode at that node instead.
    xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
    xmlNodePtr currentNode = rootNode;

    while (currentNode)
    {
        // Output the node on the way down.
        if (currentNode->type == XML_ELEMENT_NODE)
        {
            NSMutableArray *attrArray = [NSMutableArray array];

            for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
            {
                // Valueless attributes (e.g. <input disabled>) have no child node.
                xmlNodePtr contents = attrNode->children;
                const char *value = (contents && contents->content) ? (const char *)contents->content : "";

                [attrArray addObject:[NSString stringWithFormat:@"%s='%s'", attrNode->name, value]];
            }

            NSString *attrString = [attrArray componentsJoinedByString:@" "];

            if ([attrString length])
            {
                attrString = [@" " stringByAppendingString:attrString];
            }

            NSLog(@"<%s%@>", currentNode->name, attrString);
        }
        else if (currentNode->type == XML_TEXT_NODE)
        {
            if (currentNode->content)
            {
                NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
            }
        }
        else if (currentNode->type == XML_COMMENT_NODE)
        {
            // The comment text lives in ->content; ->name is the fixed node-type name.
            NSLog(@"/* %s */", currentNode->content ? (const char *)currentNode->content : "");
        }

        // Descend first.
        if (currentNode->children)
        {
            currentNode = currentNode->children;
            continue;
        }

        // No children: close this node, then climb until a sibling is found
        // or the traversal root has been closed.
        while (currentNode)
        {
            if (currentNode->type == XML_ELEMENT_NODE)
            {
                NSLog(@"</%s>", currentNode->name);
            }

            if (currentNode == rootNode)
            {
                currentNode = NULL; // whole tree visited
                break;
            }

            if (currentNode->next)
            {
                currentNode = currentNode->next;
                break;
            }

            currentNode = currentNode->parent;
        }
    }

    // Release the document only after the walk; the nodes belong to it.
    if (_htmlDocument)
    {
        xmlFreeDoc(_htmlDocument);
    }

    不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,我就写了两个方法,一个是获取children node,一个是获取所有的contents。还有node的属性content的key与node's content的key一样,都是@"nodeContent",正确情况下属性的key应是@"attributeContent",

    所以我写了这个方法,同时修改node属性的content key.

    /**
     * Recursively converts a libxml2 node into a dictionary.
     *
     * Keys written to the returned dictionary:
     *   @"nodeName"           - the node name
     *   @"nodeContent"        - text of a direct text child (stored by that child)
     *   @"nodeAttributeArray" - array of attribute dictionaries, each holding
     *                           @"attributeName" and @"attributeContent"
     *   @"nodeChildArray"     - array of dictionaries for non-text children
     *
     * @param currentNode  The node to convert.
     * @param parentResult Dictionary of the enclosing node or attribute. A text
     *                     node stores its content into it instead of returning
     *                     a dictionary of its own. May be nil.
     * @return The dictionary for currentNode, or nil when currentNode is NULL or
     *         is a text node whose parent is an element or an attribute.
     */
    NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
    {
        if (currentNode == NULL)
        {
            return nil;
        }

        NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

        if (currentNode->name)
        {
            NSString *nodeName =
            [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
            if (nodeName)
            {
                [resultForNode setObject:nodeName forKey:@"nodeName"];
            }
        }

        // Text nodes are folded into their parent rather than reported as children.
        if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent)
        {
            NSString *textContent =
            [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];
            xmlElementType parentType = currentNode->parent->type;

            if (parentType == XML_ELEMENT_NODE)
            {
                if (textContent)
                {
                    [parentResult setObject:textContent forKey:@"nodeContent"];
                }
                return nil;
            }

            if (parentType == XML_ATTRIBUTE_NODE)
            {
                if (textContent)
                {
                    [parentResult
                     setObject:
                     [textContent
                      stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]
                     forKey:@"attributeContent"];
                }
                return nil;
            }
        }

        // Attributes.
        NSMutableArray *attributeArray = [NSMutableArray array];
        for (xmlAttr *attribute = currentNode->properties; attribute; attribute = attribute->next)
        {
            NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];

            if (attribute->name)
            {
                NSString *attributeName =
                [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
                if (attributeName)
                {
                    [attributeDictionary setObject:attributeName forKey:@"attributeName"];
                }
            }

            if (attribute->children)
            {
                // A text child writes @"attributeContent" into attributeDictionary
                // itself and returns nil; any other child comes back as a dictionary.
                NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
                if (childDictionary)
                {
                    [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                }
            }

            if ([attributeDictionary count] > 0)
            {
                [attributeArray addObject:attributeDictionary];
            }
        }

        if ([attributeArray count] > 0)
        {
            [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
        }

        // Children.
        NSMutableArray *childContentArray = [NSMutableArray array];
        for (xmlNodePtr childNode = currentNode->children; childNode; childNode = childNode->next)
        {
            NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
            if (childDictionary)
            {
                [childContentArray addObject:childDictionary];
            }
        }

        if ([childContentArray count] > 0)
        {
            [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
        }

        return resultForNode;
    }

    TFHppleElement.m里加了两个key 常量

    // Keys into the node dictionary: the content of an attribute, and the
    // array of child-node dictionaries.
    NSString * const TFHppleNodeAttributeContentKey  = @"attributeContent";
    NSString * const TFHppleNodeChildArrayKey        = @"nodeChildArray";

    并修改获取属性方法为:

    /**
     * Returns the element's attributes as a dictionary keyed by attribute name.
     *
     * An attribute entry without stored content maps to the empty string,
     * because inserting nil into a dictionary would raise. Entries without a
     * name are skipped.
     */
    - (NSDictionary *) attributes
    {
      NSMutableDictionary * translatedAttributes = [NSMutableDictionary dictionary];
      for (NSDictionary * attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey])
      {
        id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
        if (!attributeName)
        {
          continue;
        }

        id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
        [translatedAttributes setObject:(attributeContent ? attributeContent : @"")
                                 forKey:attributeName];
      }
      return translatedAttributes;
    }

    并添加获取children node 方法:

    /// Returns YES when the node dictionary contains an entry under
    /// TFHppleNodeChildArrayKey, NO otherwise.
    - (BOOL) hasChildren
    {
        return [node objectForKey: TFHppleNodeChildArrayKey] != nil;
    }
    12    
    /// Returns the array stored under TFHppleNodeChildArrayKey, or nil when the
    /// node dictionary has no such entry.
    - (NSArray *) children
    {
        // objectForKey: already yields nil for a missing key.
        return [node objectForKey: TFHppleNodeChildArrayKey];
    }

    最后我还加了一个获取所有content的方法:

    /// Returns the content found for the given XPath or CSS query as one string.
    /// NOTE(review): the implementation is not shown here — confirm how multiple
    /// matches are combined and what is returned when nothing matches.
    - (NSString *)contentsAt:(NSString *)xPathOrCss;

    请看 源码

  • 相关阅读:
    关于产品那些事
    关于“编程的本质”的探讨
    分享一款在线贝塞尔曲线调试器
    HTML、CSS、JS对unicode字符的不同处理
    HTTP Content-Disposition Explanation [ from MDN ]
    认证 (authentication) 和授权 (authorization) 的区别
    事件驱动引擎会取代多线程编程吗
    你所不知道的JSON
    都有哪些特殊而实用的的搜索引擎?
    巨头们的GitHub仓库整理
  • 原文地址:https://www.cnblogs.com/greywolf/p/2861636.html
Copyright © 2011-2022 走看看