zoukankan      html  css  js  c++  java
  • 1)①爬取中国新闻网科技相关部分新闻

     1 __author__ = 'minmin'
     2 #coding:utf-8
     3 import re,urllib,sgmllib,os
     4 
     5 #根据当前的url获取html
     6 def getHtml(url):
     7     page = urllib.urlopen(url)
     8     html = page.read()
     9     page.close()
    10     return html
    11 
    12 #根据html获取想要的文章内容
    13 def func(str):
    14      result =   re.findall(r"<p.*?>([^<>]*)</p>",getHtml(url),re.M)
    15      artical =''
    16 
    17      for j in result:
    18          if len(j)<>0:
    19              j = j.replace("<strong>","    ")
    20              j = j.replace("</strong>","    ")
    21              j = j.replace("<br>","   ")
    22              j = j.replace("&nbsp;"," ")
    23              j = j.replace("&ldquo;"," ")
    24              j = j.replace("&rdquo;"," ")
    25              j = j.replace("&middot;"," ")
    26              artical = artical + j + '
    '
    27      return  artical
    28 
    29 #html链接的标签是“a”,链接的属性是“href”,也就是要获得html中所有tag=a,attrs=href 值。
    30 class URLPaser(sgmllib.SGMLParser):
    31     def reset(self):
    32         sgmllib.SGMLParser.reset(self)
    33         self.urls = []
    34 
    35     def start_a(self,attrs):
    36         href = [v for k,v in attrs if k == 'href']
    37         if href:
    38             self.urls.extend(href)
    39 
    40 IParser = URLPaser()
    41 socket = urllib.urlopen("http://economy.china.com/internet/")#打开这个网页
    42 
    43 #fout = file('qq_art_urls.txt','w')#要把这个链接写到这个文件中
    44 IParser.feed(socket.read())#分析啦
    45 
    46 reg = 'http://economy.china.com/internet/.*'#这个是用来匹配符合条件的链接,使用正则表达式匹配
    47 
    48 pattern = re.compile(reg)
    49 
    50 os.getcwd()#获得当前文件夹路径
    51 os.path.sep#当前系统路径分隔符
    52 
    53 #判断文件是否存在
    54 if os.path.exists('china_news_Technology')==False:
    55      os.makedirs('china_news_Technology')
    56 i = 0
    57 url2 = []
    58 for url in IParser.urls:#链接都存在urls里
    59     url = "http://economy.china.com"+url
    60     if pattern.match(url):
    61         if url not in url2:
    62             url2.append(url)
    63             url = url.replace(".html","_all.html#page_2")#由于一篇文章被分成几页,找到显示全页的那一个页面
    64             artical = func(url)
    65             if len(artical)<>0:
    66                   print artical
    67                   print url
    68                   i = i + 1
    69                   f = open("china_news_Technology/"+str(i) + '.txt','a+')
    70                   f.write(artical)
    71                   f.close()
  • 相关阅读:
    mac上python3安装HTMLTestRunner
    双目深度估计传统算法流程及OpenCV的编译注意事项
    深度学习梯度反向传播出现Nan值的原因归类
    1394. Find Lucky Integer in an Array
    1399. Count Largest Group
    1200. Minimum Absolute Difference
    999. Available Captures for Rook
    509. Fibonacci Number
    1160. Find Words That Can Be Formed by Characters
    1122. Relative Sort Array
  • 原文地址:https://www.cnblogs.com/minmsy/p/4962656.html
Copyright © 2011-2022 走看看