zoukankan      html  css  js  c++  java
  • 1)⑤爬取搜狐旅游部分新闻

     1 __author__ = 'minmin'
     2 #coding:utf-8
     3 import re,urllib,sgmllib
     4 
     def getHtml(url):
         """Fetch ``url`` and return the raw body of the response.

         url -- address handed unchanged to ``urllib.urlopen``.

         Returns whatever ``read()`` yields for the opened resource.
         The handle is closed even when reading fails, so an aborted
         transfer does not leave the connection open.
         """
         page = urllib.urlopen(url)
         try:
             return page.read()
         finally:
             page.close()
    11 
    def func(str):
        """Download the page at ``str`` and return the text of its paragraphs.

        str -- URL of the article page. The name shadows the builtin but is
               kept so existing callers (positional or keyword) still work.

        Returns a single string made of every non-empty ``<p>...</p>`` body
        that contains no nested tags, each followed by a newline. An empty
        string is returned when no such paragraph is found.
        """
        # Use the argument itself; the page must not depend on a global
        # variable that merely happens to be called ``url`` at the call site.
        html = getHtml(str)

        # Only paragraphs whose body has no '<' or '>' are captured.
        paragraphs = re.findall(r"<p.*?>([^<>]*)</p>", html, re.M)

        # (needle, replacement) pairs applied in order to each paragraph.
        # The three tag entries cannot match text captured by the pattern
        # above (it excludes angle brackets); they are kept as a safety net.
        replacements = (
            ("<strong>", "    "),
            ("</strong>", "    "),
            ("<br>", "   "),
            ("&nbsp;", " "),
            ("&ldquo;", " "),
            ("&rdquo;", " "),
            ("&middot;", " "),
        )

        lines = []
        for text in paragraphs:
            if len(text) != 0:
                for needle, substitute in replacements:
                    text = text.replace(needle, substitute)
                lines.append(text + '\n')
        return ''.join(lines)
    28 
    29 #html链接的标签是“a”,链接的属性是“href”,也就是要获得html中所有tag=a,attrs=href 值。
    30 class URLPaser(sgmllib.SGMLParser):
    31     def reset(self):
    32         sgmllib.SGMLParser.reset(self)
    33         self.urls = []
    34 
    35     def start_a(self,attrs):
    36         href = [v for k,v in attrs if k == 'href']
    37         if href:
    38             self.urls.extend(href)
    39 
    import os

    # Parse the travel-news index page and collect every link on it.
    IParser = URLPaser()
    socket = urllib.urlopen("http://travel.sohu.com/lvyouxinwen.shtml")
    try:
        IParser.feed(socket.read())
    finally:
        # Release the connection even if reading or parsing fails.
        socket.close()

    # Only links that start with this prefix are treated as articles.
    reg = 'http://travel.sohu.com/2015.*'
    pattern = re.compile(reg)

    # Make sure the output directory exists before the first write.
    out_dir = "sougou/Travel/"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    i = 0        # number of articles saved so far; also the output file name
    url2 = []    # links already handled, used to skip duplicates
    for url in IParser.urls:
        if not pattern.match(url) or url in url2:
            continue
        url2.append(url)
        print(url)
        artical = func(url)
        print(artical)
        if len(artical) != 0:
            i = i + 1
            # 'a+' is kept from the original: re-running appends to files
            # left over from an earlier run instead of truncating them.
            with open(out_dir + str(i) + '.txt', 'a+') as f:
                f.write(artical)
  • 相关阅读:
    Elasticsearch 类比 mysql 实现 in and like or
    es 全文查询
    es 聚合查询
    es多字段分组并求数量
    es 多字段分组并求和
    es 滚动查询二
    es 滚动查询一
    java8 日期操作
    语录(心灵鸡汤来一波)
    并发处理-隔离级别
  • 原文地址:https://www.cnblogs.com/minmsy/p/4962745.html
Copyright © 2011-2022 走看看