zoukankan      html  css  js  c++  java
  • python爬虫:搜狗微信公众号文章信息的采集(https://weixin.sogou.com/),保存csv文件

      1 import requests
      2 from requests.exceptions import RequestException
      3 from lxml import etree
      4 import csv
      5 import re
      6 import time
      7 from urllib import parse
      8 import time
      9 
     10 
     11 def get_page(url):
     12     """
     13         获取网页的源代码
     14     :param url:
     15     :return:
     16     """
     17     try:
     18         headers = {
     19             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
     20 
     21         }
     22         response = requests.get(url, headers=headers)
     23         if response.status_code == 200:
     24             return response.text
     25         return None
     26     except RequestException:
     27         return None
     28 
     29 def timeswitch(chuo):
     30 
     31     tupTime = time.localtime(chuo)  # 秒时间戳
     32     stadardTime = time.strftime("%Y-%m-%d %H:%M:%S", tupTime)
     33     return stadardTime
     34 
     35 def parse_page(text):
     36     """
     37         解析网页源代码
     38     :param text:
     39     :return:
     40     """
     41     html = etree.HTML(text)
     42 
     43     '''
     44     movie_name = html.xpath("//*[@id='sogou_vr_11002601_title_0']/text()[1]")
     45     actor = html.xpath("//p[@class='star']/text()")
     46     actor = list(map(lambda item: re.sub('s+', '', item), actor))
     47     time = html.xpath("//p[@class='releasetime']/text()")
     48     grade1 = html.xpath("//p[@class='score']/i[@class='integer']/text()")
     49     grade2 = html.xpath("//p[@class='score']/i[@class='fraction']/text()")
     50     new = [grade1[i] + grade2[i] for i in range(min(len(grade1), len(grade2)))]
     51     ranking = html.xpath("///dd/i/text()")
     52     return zip(ranking, movie_name, actor, time, new)
     53     '''
     54 
     55     biaotinew = list()
     56     biaoti = html.xpath("//div[@class='txt-box']/h3/a")
     57     for bt in biaoti:
     58         b = bt.xpath("string(.)")
     59         biaotinew.append(b)
     60     print(biaotinew)
     61 
     62     wangzhinew = list()
     63     base_url = 'https://weixin.sogou.com'
     64     wangzhi = html.xpath("//div[@class='txt-box']/h3//@href")
     65     for wz in wangzhi:
     66         w = "".join(list(base_url)+wangzhi)
     67         wangzhinew.append(w)
     68     print(wangzhinew)
     69 
     70     zhaiyaonew = list()
     71     zhaiyao = html.xpath("//p[@class='txt-info']")
     72     for bt in zhaiyao:
     73         b = bt.xpath("string(.)")
     74         zhaiyaonew.append(b)
     75     print(zhaiyaonew)
     76 
     77     gzh  = html.xpath("//a[@class='account']/text()")
     78     print(gzh)
     79 
     80     lastnew = list()
     81     shijiannew = list()
     82     shijian = html.xpath("//div[2]/div/span")
     83     for bt in shijian:
     84         b = bt.xpath("string(.)")
     85         shijiannew.append(b)
     86     for bp in shijiannew :
     87         newstr  = re.findall(r"d+.?d*",bp)
     88         # ['1.45', '5', '6.45', '8.82']
     89         lastor = ''.join(newstr)
     90         lastnew.append(timeswitch(int(lastor)))
     91     print(lastnew)
     92 
     93 
     94 
     95 
     96     return zip(biaotinew,wangzhinew,zhaiyaonew,gzh,lastnew)
     97 
     98 
     99 
    100 
    101 def change_page1(number):
    102     """
    103         翻页
    104     :param number:
    105     :return:
    106     """
    107     base_url ='https://weixin.sogou.com/weixin?oq=&query=python&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2&sst0=1604564741184&page='
    108     url = base_url +str(number)+'&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'
    109     return url
    110 
    111 
    112 def save_to_csv(result, filename):
    113     """
    114         保存
    115     :param result:
    116     :param filename:
    117     :return:
    118     """
    119     with open(filename, 'a',encoding='utf-8-sig',newline="") as csvfile:
    120         writer = csv.writer(csvfile, dialect='excel')
    121         writer.writerow(result)
    122 
    123 
    124 def main():
    125     """
    126     主函数
    127     :return:
    128     """
    129     f = open('message.csv', 'a+', encoding='utf-8-sig', newline="")  # newline取消空行
    130     csv_writer = csv.writer(f)
    131     csv_writer.writerow(["文章名称","文章链接地址","摘要","公众号名称","发布时间"])
    132     f.close()
    133 
    134 
    135 
    136 
    137     for number in range(1,6):
    138         url = change_page1(number)
    139         text = get_page(url)
    140         result = parse_page(text)
    141         for a in result:
    142             save_to_csv(a, filename='message.csv')
    143 
    144 
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
  • 相关阅读:
    【用例篇】Xmind转为csv 导入禅道
    idea替换当前文件内容
    配置IDEA项目JDK环境
    git 只提交部分修改文件
总结TESTNG与JUNIT的异同
    POST请求BODY格式区别
    【转】使用AllureReport生成测试报告
    Springboot+Redis 配置和使用
    【转】git branch 命令查看分支、删除远程分支、本地分支
    [转]Json字符串和map和HashMap之间的转换
  • 原文地址:https://www.cnblogs.com/Zoeun/p/13933662.html
Copyright © 2011-2022 走看看