zoukankan      html  css  js  c++  java
  • iOS解析HTML

    xml,json都有大量的库来解析,我们如何解析html呢?


    TFHpple是一个小型的封装,可以用来解析html,它是对libxml的封装,语法是xpath。
    今天我看到一篇直接用libxml来解析html的文章,参看:http://www.cocoanetics.com/2011/09/taming-html-parsing-with-libxml-1/#comment-3090 ,文中那张图画得一目了然,很值得收藏。不过这篇文章中的源码不能遍历所有的html节点,我做了一点修改,可以将整个html遍历并打印出来:

    // NSData data contains the document data
    // encoding is the NSStringEncoding of the data
    // baseURL the documents base URL, i.e. location

    // Map the Cocoa encoding to the IANA charset name that libxml2 expects.
    // CFStringGetCStringPtr() is allowed to return NULL, so the name is copied
    // into a local buffer instead. A NULL encoding leaves detection to libxml2.
    CFStringEncoding cfenc = CFStringConvertNSStringEncodingToEncoding(encoding);
    CFStringRef cfencstr = CFStringConvertEncodingToIANACharSetName(cfenc);
    char encBuffer[64];
    const char *enc = NULL;
    if (cfencstr && CFStringGetCString(cfencstr, encBuffer, sizeof(encBuffer), kCFStringEncodingASCII))
    {
        enc = encBuffer;
    }

    // NSData is not NUL-terminated, so hand libxml2 the bytes together with their length.
    htmlDocPtr _htmlDocument = htmlReadMemory((const char *)[data bytes],
                                              (int)[data length],
                                              [[baseURL absoluteString] UTF8String],
                                              enc,
                                              XML_PARSE_NOERROR | XML_PARSE_NOWARNING);

    if (_htmlDocument)
    {
        // The walk starts at rootNode and never climbs above it. Point rootNode
        // at any other node to dump only that subtree.
        xmlNodePtr rootNode = (xmlNodePtr)_htmlDocument;
        xmlNodePtr currentNode = rootNode;

        while (currentNode)
        {
            // Entering a node: print its opening form.
            if (currentNode->type == XML_ELEMENT_NODE)
            {
                NSMutableArray *attrArray = [NSMutableArray array];

                for (xmlAttrPtr attrNode = currentNode->properties; attrNode; attrNode = attrNode->next)
                {
                    // Attributes without a value (e.g. "disabled") have no child node.
                    xmlNodePtr contents = attrNode->children;
                    NSString *attrName = [NSString stringWithUTF8String:(const char *)attrNode->name];
                    NSString *attrValue = @"";
                    if (contents && contents->content)
                    {
                        attrValue = [NSString stringWithUTF8String:(const char *)contents->content];
                    }

                    [attrArray addObject:[NSString stringWithFormat:@"%@='%@'", attrName, attrValue]];
                }

                NSString *attrString = [attrArray componentsJoinedByString:@" "];

                if ([attrString length])
                {
                    attrString = [@" " stringByAppendingString:attrString];
                }

                NSLog(@"<%s%@>", currentNode->name, attrString);
            }
            else if (currentNode->type == XML_TEXT_NODE)
            {
                if (currentNode->content)
                {
                    NSLog(@"%@", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
                }
            }
            else if (currentNode->type == XML_COMMENT_NODE)
            {
                // The comment text is stored in content; name only holds the node kind.
                if (currentNode->content)
                {
                    NSLog(@"/* %@ */", [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding]);
                }
            }

            // Depth first: descend into the children before visiting siblings.
            if (currentNode->children)
            {
                currentNode = currentNode->children;
                continue;
            }

            // Nothing left below this node: close it, then move to the next
            // sibling, climbing (and closing each parent) until one exists.
            while (currentNode)
            {
                if (currentNode->type == XML_ELEMENT_NODE)
                {
                    NSLog(@"</%s>", currentNode->name);
                }

                if (currentNode == rootNode)
                {
                    // The whole subtree has been printed.
                    currentNode = NULL;
                    break;
                }

                if (currentNode->next)
                {
                    currentNode = currentNode->next;
                    break;
                }

                currentNode = currentNode->parent;
            }
        }

        // Release the document only after the traversal has finished with it.
        xmlFreeDoc(_htmlDocument);
    }


    不过我还是喜欢用TFHpple,因为它很简单,也好用,但是它的功能不是很完善。比如,不能获取children node,所以我写了两个方法,一个是获取children node,一个是获取所有的contents。另外,node属性的content所用的key与node自身content的key一样,都是@"nodeContent",正确情况下属性的key应该是@"attributeContent",
    所以我写了下面这个函数,同时修改了node属性的content key。
    /**
     * Converts a libxml2 node, its attributes and (recursively) its children
     * into a dictionary.
     *
     * Keys written to the returned dictionary:
     *   @"nodeName"            the node name
     *   @"nodeAttributeArray"  array of attribute dictionaries, each holding
     *                          @"attributeName" and, when the attribute has a
     *                          value, @"attributeContent"
     *   @"nodeChildArray"      array of dictionaries for the child nodes
     *
     * A text node with content is not returned as a dictionary of its own when
     * its parent is an element or an attribute. Its text is stored in
     * parentResult instead (@"nodeContent" for an element parent, trimmed
     * @"attributeContent" for an attribute parent) and nil is returned. When an
     * element has several text children, each one overwrites @"nodeContent",
     * so the last one wins.
     *
     * @param currentNode   The node to convert.
     * @param parentResult  Dictionary of the enclosing node or attribute; may be nil.
     * @return The dictionary for currentNode, or nil if the node was folded
     *         into parentResult.
     */
    NSDictionary *DictionaryForNode2(xmlNodePtr currentNode, NSMutableDictionary *parentResult)
    {
        if (!currentNode)
        {
            return nil;
        }

        NSMutableDictionary *resultForNode = [NSMutableDictionary dictionary];

        if (currentNode->name)
        {
            NSString *nodeName =
                [NSString stringWithCString:(const char *)currentNode->name encoding:NSUTF8StringEncoding];
            // The conversion yields nil for bytes that are not valid UTF-8, and
            // storing nil in a dictionary throws.
            if (nodeName)
            {
                [resultForNode setObject:nodeName forKey:@"nodeName"];
            }
        }

        if (currentNode->content && currentNode->type == XML_TEXT_NODE && currentNode->parent)
        {
            NSString *currentNodeContent =
                [NSString stringWithCString:(const char *)currentNode->content encoding:NSUTF8StringEncoding];

            if (currentNode->parent->type == XML_ELEMENT_NODE)
            {
                if (currentNodeContent)
                {
                    [parentResult setObject:currentNodeContent forKey:@"nodeContent"];
                }
                return nil;
            }

            if (currentNode->parent->type == XML_ATTRIBUTE_NODE)
            {
                NSString *trimmedContent =
                    [currentNodeContent stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
                if (trimmedContent)
                {
                    [parentResult setObject:trimmedContent forKey:@"attributeContent"];
                }
                return nil;
            }
        }

        xmlAttr *attribute = currentNode->properties;
        if (attribute)
        {
            NSMutableArray *attributeArray = [NSMutableArray array];
            while (attribute)
            {
                NSMutableDictionary *attributeDictionary = [NSMutableDictionary dictionary];
                NSString *attributeName = nil;
                if (attribute->name)
                {
                    attributeName =
                        [NSString stringWithCString:(const char *)attribute->name encoding:NSUTF8StringEncoding];
                }
                if (attributeName)
                {
                    [attributeDictionary setObject:attributeName forKey:@"attributeName"];
                }

                if (attribute->children)
                {
                    // A text child writes @"attributeContent" into attributeDictionary
                    // itself and returns nil; anything else comes back as a dictionary.
                    NSDictionary *childDictionary = DictionaryForNode2(attribute->children, attributeDictionary);
                    if (childDictionary)
                    {
                        [attributeDictionary setObject:childDictionary forKey:@"attributeContent"];
                    }
                }

                if ([attributeDictionary count] > 0)
                {
                    [attributeArray addObject:attributeDictionary];
                }
                attribute = attribute->next;
            }

            if ([attributeArray count] > 0)
            {
                [resultForNode setObject:attributeArray forKey:@"nodeAttributeArray"];
            }
        }

        xmlNodePtr childNode = currentNode->children;
        if (childNode)
        {
            NSMutableArray *childContentArray = [NSMutableArray array];
            while (childNode)
            {
                NSDictionary *childDictionary = DictionaryForNode2(childNode, resultForNode);
                if (childDictionary)
                {
                    [childContentArray addObject:childDictionary];
                }
                childNode = childNode->next;
            }
            if ([childContentArray count] > 0)
            {
                [resultForNode setObject:childContentArray forKey:@"nodeChildArray"];
            }
        }

        return resultForNode;
    }

    TFHppleElement.m里加了两个key 常量
    // Key under which an attribute dictionary stores the attribute's value.
    NSString * const TFHppleNodeAttributeContentKey = @"attributeContent"; 
    // Key under which a node dictionary stores the array of its child node dictionaries.
    NSString * const TFHppleNodeChildArrayKey = @"nodeChildArray";

    并修改获取属性方法为:
    /**
     * Returns the element's attributes as a name -> value dictionary.
     *
     * An attribute with no stored value (for example a boolean attribute such
     * as "disabled") is mapped to the empty string, because a dictionary cannot
     * hold nil. Attribute entries without a name are skipped.
     */
    - (NSDictionary *) attributes
    {
        NSMutableDictionary *translatedAttributes = [NSMutableDictionary dictionary];
        for (NSDictionary *attributeDict in [node objectForKey:TFHppleNodeAttributeArrayKey])
        {
            id attributeName = [attributeDict objectForKey:TFHppleNodeAttributeNameKey];
            if (!attributeName)
            {
                continue;
            }

            id attributeContent = [attributeDict objectForKey:TFHppleNodeAttributeContentKey];
            if (!attributeContent)
            {
                attributeContent = @"";
            }

            [translatedAttributes setObject:attributeContent forKey:attributeName];
        }
        return translatedAttributes;
    }

    并添加获取children node 方法:
    /**
     * Returns YES when the node dictionary contains an entry for
     * TFHppleNodeChildArrayKey, NO otherwise.
     */
    - (BOOL) hasChildren
    {
        NSArray *childNodes = [node objectForKey:TFHppleNodeChildArrayKey];
        return childNodes != nil;
    }

    /**
     * Returns the array stored under TFHppleNodeChildArrayKey, or nil when the
     * node dictionary has no such entry.
     */
    - (NSArray *) children
    {
        // objectForKey: already returns nil for a missing key, so a separate
        // existence check is not needed.
        return [node objectForKey:TFHppleNodeChildArrayKey];
    }

    参看:http://giles-wang.blogspot.com/2011/08/iphoneansi.html

    原文:http://blog.csdn.net/favormm/article/details/6794487

  • 相关阅读:
    uva1610 Party Games
    uva1442 Cav
    uva1609 Foul Play
    uva1608 Non-boring sequences
    uva12174 滑动窗口+预处理
    uva 1451 数形结合
    light oj 1336 sigma function
    找常用词(字符串处理)问题
    指定排序问题
    完数问题
  • 原文地址:https://www.cnblogs.com/fakemessi/p/4900893.html
Copyright © 2011-2022 走看看