zoukankan      html  css  js  c++  java
  • python爬虫:搜狗微信公众号文章信息的采集(https://weixin.sogou.com/),保存csv文件

      1 import requests
      2 from requests.exceptions import RequestException
      3 from lxml import etree
      4 import csv
      5 import re
      6 import time
      7 from urllib import parse
      8 import time
      9 
     10 
     11 def get_page(url):
     12     """
     13         获取网页的源代码
     14     :param url:
     15     :return:
     16     """
     17     try:
     18         headers = {
     19             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
     20 
     21         }
     22         response = requests.get(url, headers=headers)
     23         if response.status_code == 200:
     24             return response.text
     25         return None
     26     except RequestException:
     27         return None
     28 
     29 def timeswitch(chuo):
     30 
     31     tupTime = time.localtime(chuo)  # 秒时间戳
     32     stadardTime = time.strftime("%Y-%m-%d %H:%M:%S", tupTime)
     33     return stadardTime
     34 
     35 def parse_page(text):
     36     """
     37         解析网页源代码
     38     :param text:
     39     :return:
     40     """
     41     html = etree.HTML(text)
     42 
     43     '''
     44     movie_name = html.xpath("//*[@id='sogou_vr_11002601_title_0']/text()[1]")
     45     actor = html.xpath("//p[@class='star']/text()")
     46     actor = list(map(lambda item: re.sub('s+', '', item), actor))
     47     time = html.xpath("//p[@class='releasetime']/text()")
     48     grade1 = html.xpath("//p[@class='score']/i[@class='integer']/text()")
     49     grade2 = html.xpath("//p[@class='score']/i[@class='fraction']/text()")
     50     new = [grade1[i] + grade2[i] for i in range(min(len(grade1), len(grade2)))]
     51     ranking = html.xpath("///dd/i/text()")
     52     return zip(ranking, movie_name, actor, time, new)
     53     '''
     54 
     55     biaotinew = list()
     56     biaoti = html.xpath("//div[@class='txt-box']/h3/a")
     57     for bt in biaoti:
     58         b = bt.xpath("string(.)")
     59         biaotinew.append(b)
     60     print(biaotinew)
     61 
     62     wangzhinew = list()
     63     base_url = 'https://weixin.sogou.com'
     64     wangzhi = html.xpath("//div[@class='txt-box']/h3//@href")
     65     for wz in wangzhi:
     66         w = "".join(list(base_url)+wangzhi)
     67         wangzhinew.append(w)
     68     print(wangzhinew)
     69 
     70     zhaiyaonew = list()
     71     zhaiyao = html.xpath("//p[@class='txt-info']")
     72     for bt in zhaiyao:
     73         b = bt.xpath("string(.)")
     74         zhaiyaonew.append(b)
     75     print(zhaiyaonew)
     76 
     77     gzh  = html.xpath("//a[@class='account']/text()")
     78     print(gzh)
     79 
     80     lastnew = list()
     81     shijiannew = list()
     82     shijian = html.xpath("//div[2]/div/span")
     83     for bt in shijian:
     84         b = bt.xpath("string(.)")
     85         shijiannew.append(b)
     86     for bp in shijiannew :
     87         newstr  = re.findall(r"d+.?d*",bp)
     88         # ['1.45', '5', '6.45', '8.82']
     89         lastor = ''.join(newstr)
     90         lastnew.append(timeswitch(int(lastor)))
     91     print(lastnew)
     92 
     93 
     94 
     95 
     96     return zip(biaotinew,wangzhinew,zhaiyaonew,gzh,lastnew)
     97 
     98 
     99 
    100 
    101 def change_page1(number):
    102     """
    103         翻页
    104     :param number:
    105     :return:
    106     """
    107     base_url ='https://weixin.sogou.com/weixin?oq=&query=python&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1604564741184&page='
    108     url = base_url +str(number)+'&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'
    109     return url
    110 
    111 
    112 def save_to_csv(result, filename):
    113     """
    114         保存
    115     :param result:
    116     :param filename:
    117     :return:
    118     """
    119     with open(filename, 'a',encoding='utf-8-sig',newline="") as csvfile:
    120         writer = csv.writer(csvfile, dialect='excel')
    121         writer.writerow(result)
    122 
    123 
    124 def main():
    125     """
    126     主函数
    127     :return:
    128     """
    129     f = open('message.csv', 'a+', encoding='utf-8-sig', newline="")  # newline取消空行
    130     csv_writer = csv.writer(f)
    131     csv_writer.writerow(["文章名称","文章链接地址","摘要","公众号名称","发布时间"])
    132     f.close()
    133 
    134 
    135 
    136 
    137     for number in range(1,6):
    138         url = change_page1(number)
    139         text = get_page(url)
    140         result = parse_page(text)
    141         for a in result:
    142             save_to_csv(a, filename='message.csv')
    143 
    144 
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
  • 相关阅读:
    【用例篇】Xmind转为csv 导入禅道
    idea替换当前文件内容
    配置IDEA项目JDK环境
    git 只提交部分修改文件
总结TESTNG与JUNIT的异同
    POST请求BODY格式区别
    【转】使用AllureReport生成测试报告
    Springboot+Redis 配置和使用
    【转】git branch 命令查看分支、删除远程分支、本地分支
    [转]Json字符串和map和HashMap之间的转换
  • 原文地址:https://www.cnblogs.com/Zoeun/p/13933662.html
Copyright © 2011-2022 走看看