zoukankan      html  css  js  c++  java
  • 爬取IT之家业界新闻

    爬取站点  https://it.ithome.com/ityejie/ ,进入详情页提取内容。

      1 import requests
      2 import json
      3 from lxml import etree
      4 from pymongo import MongoClient
      5 
      6 url = 'https://it.ithome.com/ithome/getajaxdata.aspx'
      7 headers = {
      8     'authority': 'it.ithome.com',
      9     'method': 'POST',
     10     'path': '/ithome/getajaxdata.aspx',
     11     'scheme': 'https',
     12     'accept': 'text/html, */*; q=0.01',
     13     'accept-encoding': 'gzip, deflate, br',
     14     'accept-language': 'zh-CN,zh;q=0.9',
     15     'content-length': '40',
     16     'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
     17     'cookie': 'BAIDU_SSP_lcr=https://www.hao123.com/link/https/?key=http%3A%2F%2Fwww.ithome.com%2F&&monkey=m-kuzhan-group1&c=B329C2F33C91DEACCFAEB1680305F198; Hm_lvt_f2d5cbe611513efcf95b7f62b934c619=1530106766; ASP.NET_SessionId=tyxenfioljanx4xwsvz3s4t4; Hm_lvt_cfebe79b2c367c4b89b285f412bf9867=1530106547,1530115669; BEC=228f7aa5e3abfee5d059195ad34b4137|1530117889|1530109082; Hm_lpvt_f2d5cbe611513efcf95b7f62b934c619=1530273209; Hm_lpvt_cfebe79b2c367c4b89b285f412bf9867=1530273261',
     18     'origin': 'https://it.ithome.com',
     19     'referer': 'https://it.ithome.com/ityejie/',
     20     'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3472.3 Safari/537.36',
     21     'x-requested-with': 'XMLHttpRequest'
     22 }
     23 
     24 client = MongoClient()
     25 db = client['ithome']
     26 collection = db['ithome']
     27 max_page = 1000
     28 
     29 def get_page(page):  
     30 
     31     formData = {
     32         'categoryid': '31',
     33         'type': 'pccategorypage',
     34         'page': page,
     35         }
     36     try:
     37         r = requests.post(url, data=formData, headers=headers)
     38         if r.status_code == 200:
     39 
     40             #print(type(r))
     41             html = r.text
     42             # 响应返回的是字符串,解析为HTML DOM模式 text = etree.HTML(html)
     43             text = etree.HTML(html)
     44             link_list = text.xpath('//h2/a/@href') 
     45 
     46             print("提取第"+str(page)+"页文章")
     47             id=0
     48             for link in link_list:
     49                 id+=1
     50                 print("解析第"+str(page)+"页第"+str(id)+"篇文章")
     51                 print("链接为:"+link)
     52                 loadpage(link)
     53 
     54     except requests.ConnectionError as e:
     55         print('Error', e.args)    
     56 
     57 
# Parse one article's detail page (fetched from its link) and persist the extracted fields
     59 def loadpage(link):
     60 
     61     headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3472.3 Safari/537.36'}
     62 
     63     try:
     64  
     65         reseponse = requests.get(link, headers = headers)
     66         if reseponse.status_code == 200:     
     67             html = reseponse.text
     68             # 解析
     69             node = etree.HTML(html)
     70 
     71             ithome ={}
     72             # 取出每个标题,正文等
     73 
     74             # xpath返回的列表,这个列表就这一个参数,用索引方式取出来,标题
     75             ithome['title'] = node.xpath('//*[@id="wrapper"]/div[1]/div[2]/h1')[0].text
     76             # 时间
     77             ithome['data'] = node.xpath('//*[@id="pubtime_baidu"]')[0].text
     78             # 取出标签下的内容
     79             #content = node.xpath('//*[@id="paragraph"]/p/text()')
     80             ithome['content'] = "".join(node.xpath('//*[@id="paragraph"]/p/text()')).strip()
     81             #content = node.xpath('//*[@id="paragraph"]/p')[1].text
     82             # 取出标签里包含的内容,作者
     83             ithome['author'] = node.xpath('//*[@id="author_baidu"]/strong')[0].text
     84             # 评论数
     85             ithome['commentcount'] = node.xpath('//span[@id="commentcount"]')[0].text
     86             #评论数没有取到
     87             write_to_file(ithome)
     88             save_to_mongo(ithome)             
     89 
     90     except requests.ConnectionError as e:
     91         print('Error', e.args)  
     92 
     93 def write_to_file(content):
     94     with open('ithome.json','a',encoding='utf-8') as f:
     95         f.write(json.dumps(content,ensure_ascii=False)+'
    ')
     96         f.close()
     97 
     98 def save_to_mongo(result):
     99     if collection.insert(result):
    100         print('Saved to Mongo')
    101 
    102 if __name__ == '__main__':
    103     for page in range(1, max_page + 1):
    104         get_page(page)
    105 
    106                         
  • 相关阅读:
    介绍Collection框架的结构;Collection 和 Collections的区别
    Mybites和hibernate的优缺点和区别2
    Mybites和hibernate的优缺点和区别
    AJAX如何获取从前台传递过来的数据然后在通过servle传递给后台
    list map set 集合的区别
    乐观锁和悲观锁的区别
    python生产消费Kafka
    python类型转换
    python实现远程方法调用
    Scala常用数据结构
  • 原文地址:https://www.cnblogs.com/wanglinjie/p/9246369.html
Copyright © 2011-2022 走看看