使用 XPath 解析 HTML
// Download the news list page, parse it with TFHpple, and collect the link text and
// absolute URL of the <a> elements at indexes 87..101 into currentNewsArr / currentHrefArr.
// NOTE(review): -initWithContentsOfURL: is synchronous; confirm this does not run on the main thread.
NSString *urlString = @"http://www.xiyou.edu.cn/new/lm.jsp?urltype=tree.TreeTempUrl&wbtreeid=724";
NSData *htmlData = [[NSData alloc] initWithContentsOfURL:[NSURL URLWithString:urlString]];
// Re-encode the downloaded bytes as UTF-8 before handing them to the parser.
NSData *toHtmlData = [self toUTF8:htmlData];
TFHpple *xpathParser = [[TFHpple alloc] initWithHTMLData:toHtmlData];
NSArray *aArray = [xpathParser searchWithXPathQuery:@"//a"];

// Take up to 15 anchors starting at index 87. The upper bound is clamped to the number of
// anchors actually found, so a failed download or a changed page layout cannot index past
// the end of the array.
NSUInteger endIndex = MIN((NSUInteger)102, [aArray count]);
for (NSUInteger i = 87; i < endIndex; i++) {
    TFHppleElement *aElement = [aArray objectAtIndex:i];

    // The link text is read from three levels below the <a>: follow the first child at
    // each level and stop early if a level has no children.
    TFHppleElement *textNode = aElement;
    for (int depth = 0; depth < 3 && textNode != nil; depth++) {
        NSArray *children = [textNode children];
        textNode = ([children count] > 0) ? [children objectAtIndex:0] : nil;
    }
    NSString *aStr = [textNode content];
    NSLog(@"aStr:%@", aStr);

    // Read the href attribute of the <a> element.
    NSDictionary *aAttributeDict = [aElement attributes];
    NSLog(@"aAttributeDict:%@", aAttributeDict);
    NSString *href = [aAttributeDict objectForKey:@"href"];

    // Skip anchors without text or href: -addObject: throws on nil, and the two arrays
    // must stay index-aligned.
    if (aStr == nil || href == nil) {
        continue;
    }

    NSString *hrefStr = [NSString stringWithFormat:@"http://www.xiyou.edu.cn%@", href];
    NSLog(@"hrefStr:%@", hrefStr);
    [currentNewsArr addObject:aStr];
    [currentHrefArr addObject:hrefStr];
}

// Balance the two alloc/init calls above on every path, including when no anchors matched.
[htmlData release];
[xpathParser release];
// Converts HTML that is not UTF-8 encoded (here: GBK) into UTF-8 so that it can be parsed.
//
// The bytes are decoded as GB18030 (kCFStringEncodingGB_18030_2000), the page's GBK charset
// declaration is rewritten to UTF-8, and the text is re-encoded as UTF-8.
//
// @param sourceData Raw HTML bytes in GBK/GB18030 encoding.
// @return The UTF-8 encoded HTML, or nil if sourceData is nil or cannot be decoded.
-(NSData *) toUTF8:(NSData *)sourceData {
    if (sourceData == nil) {
        // Nothing to convert (e.g. the download failed); avoid handing a NULL buffer to Core Foundation.
        return nil;
    }

    CFStringRef gbkStr = CFStringCreateWithBytes(NULL, [sourceData bytes], [sourceData length], kCFStringEncodingGB_18030_2000, false);
    if (gbkStr == NULL) {
        return nil;
    }

    NSString *gbkString = (NSString *)gbkStr;
    // Adjust this to the charset declared in the page source; here GBK is replaced with UTF-8.
    // The match is exact and case-sensitive, so a differently written meta tag is left untouched.
    NSString *utf8_String = [gbkString stringByReplacingOccurrencesOfString:@"META http-equiv=\"Content-Type\" content=\"text/html; charset=GBK\""
                                                                 withString:@"META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\""];
    NSData *utf8Data = [utf8_String dataUsingEncoding:NSUTF8StringEncoding];

    // CFStringCreateWithBytes follows the Create rule (+1), so release it once the UTF-8
    // data has been produced.
    CFRelease(gbkStr);
    return utf8Data;
}