zoukankan html css js c++ java

1)⑤爬取搜狐旅游部分新闻

 1 __author__ = 'minmin'
 2 #coding:utf-8
 3 import re,urllib,sgmllib
 4 
 5 #根据当前的url获取html
 6 def getHtml(url):
 7     page = urllib.urlopen(url)
 8     html = page.read()
 9     page.close()
10     return html
11 
def func(str):
    """Download a page and return the text of its simple ``<p>`` paragraphs.

    Args:
        str: URL of the article page. The name shadows the builtin but is
            kept so existing callers (positional or keyword) keep working.

    Returns:
        The non-empty paragraph texts, each followed by a newline, joined
        into one string. Returns an empty string when nothing matches.
    """
    # Fetch the page named by the argument; do not rely on a module-level
    # variable that merely happens to hold the same URL.
    html = getHtml(str)

    # The capture group excludes '<' and '>', so only paragraphs without
    # nested tags match, and the captured text can never contain markup.
    paragraphs = re.findall(r"<p.*?>([^<>]*)</p>", html, re.M)

    # HTML entities that are flattened to a single space.
    entities = ("&nbsp;", "&ldquo;", "&rdquo;", "&middot;")

    lines = []
    for text in paragraphs:
        if not text:
            continue
        for entity in entities:
            text = text.replace(entity, " ")
        lines.append(text + '\n')
    return ''.join(lines)
28 
29 #html链接的标签是“a”，链接的属性是“href”，也就是要获得html中所有tag=a，attrs=href 值。
30 class URLPaser(sgmllib.SGMLParser):
31     def reset(self):
32         sgmllib.SGMLParser.reset(self)
33         self.urls = []
34 
35     def start_a(self,attrs):
36         href = [v for k,v in attrs if k == 'href']
37         if href:
38             self.urls.extend(href)
39 
IParser = URLPaser()

# Open the travel-news index page and feed it to the link parser.
socket = urllib.urlopen("http://travel.sohu.com/lvyouxinwen.shtml")
try:
    IParser.feed(socket.read())
finally:
    # Close the response handle whether or not reading/parsing succeeded.
    socket.close()

# Pattern selecting the article links to follow. pattern.match() anchors at
# the start of the string, so only links beginning with this prefix pass.
reg = 'http://travel.sohu.com/2015.*'

pattern = re.compile(reg)
i = 0       # number of articles written so far; also the output file name
url2 = []   # links already handled, used to skip duplicates
# NOTE(review): keep this loop variable named `url` unless func() is
# confirmed to use its own argument rather than a module-level `url`.
for url in IParser.urls:  # every collected link lives in IParser.urls
    if not pattern.match(url) or url in url2:
        continue
    url2.append(url)
    print(url)
    artical = func(url)
    print(artical)
    if artical:
        i = i + 1
        # 'a+' appends, so re-running adds to existing files.
        # The target directory must already exist.
        with open("sougou/Travel/" + str(i) + '.txt', 'a+') as f:
            f.write(artical)

查看全文

相关阅读:
分布式锁
 zookeeper
工作流笔记第四天_流程变量
 工作流笔记第三天_流程实例
 工作流笔记第二天_流程定义的CRUD
工作流笔记第一天_简单介绍activiti
groovy修改代码不用重启通过监听记录改变时间重新加载
 遇到的前端问题
 常用正则表达式大全
 Hibernate中Session.get()方法和load()方法的详细比较

原文地址：https://www.cnblogs.com/minmsy/p/4962745.html