zoukankan html css js c++ java

假期学习【十一】Python爬取百度词条写入csv格式 python 2020.2.10

今天主要完成了：根据之前爬取并分词得到的txt文档中的词语，到百度百科中爬取属于信息科学等相关分类的词条信息，并写入CSV格式文件。

txt格式文件如图所示，内容为自己爬取的内容经过分词后的结果。

代码如下：

 1 import requests
 2 from bs4 import BeautifulSoup
 3 import csv
 4 import io
 5 import re
 6 
 7 url="https://baike.baidu.com/item/"  # base entry URL; read() appends one word from the word list to it
 8 id=1  # running row number for the CSV; read() increments it after each row it writes (note: shadows the builtin id)
 9 patton=re.compile(r'.*信息科学分类.*|.*软件.*|.*科技产品.*|.*公司.*|.*互联网人物.*|.*互联网.*|.*科技术语.*|.*技术.*|.*网站.*')  # filter applied in read() to an entry's open-tag text; only matching entries are written
10 
def Head(path='E:/bdbk2.csv'):
    """Create (or truncate) the output CSV and write its header row.

    Args:
        path: Destination CSV file. The default is the same file that
            read() appends to, so calling Head() with no argument behaves
            exactly as before.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(path, 'w', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerow(["序号", "名称", "属性", "内容", "网址"])
16 
def read(csv_path='E:/bdbk2.csv', word_path='E:/word5.txt', timeout=10):
    """Look up each word of the word list on Baidu Baike and append matches to the CSV.

    For every non-empty line of ``word_path`` the page ``url + word`` is
    fetched and parsed. One CSV row (number, name, nature, summary, page
    URL) is appended only when the page title is not "百度百科错误页" and
    the entry's open-tag text matches the module-level ``patton`` filter.
    The module-level counter ``id`` supplies the row number and is
    incremented after each row written.

    Any error while fetching or parsing a single word is reported with
    "出错!" and that word is skipped; the loop continues with the next one.

    Args:
        csv_path: CSV file to append rows to.
        word_path: UTF-8 text file containing one word per line.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            a request can block indefinitely on an unresponsive server.
    """
    global id  # module-level row counter (the name shadows the builtin)
    headers = {'user-agent': 'Mozilla/5.0'}

    # Distinct names for the two files: both are closed when the block
    # exits, so buffered CSV rows are always flushed, even on an error.
    with open(csv_path, 'a', encoding='utf-8', newline='') as csv_file, \
            open(word_path, 'r', encoding='utf-8') as word_file:
        csv_writer = csv.writer(csv_file)
        for line in word_file:
            word = line.rstrip("\n")
            if not word:
                # A blank line would only request the bare base URL.
                continue
            # Build the page URL locally instead of mutating the global.
            page_url = url + word
            try:
                r = requests.get(page_url, headers=headers, timeout=timeout)
                r.encoding = "utf-8"
                soup = BeautifulSoup(r.text, "html.parser")
                print(page_url)

                # Each [0] raises IndexError when the element is missing;
                # that is caught below and the word is skipped.
                tag = (soup.find_all("dd", {"id": "open-tag-item"})[0].get_text()
                       .replace("（", "").replace("）", "").strip().replace("\n", ""))
                name = soup.find_all("h1")[0].get_text().strip()
                nature = soup.find_all("h2")[0].get_text().replace("（", "").replace("）", "").strip()
                if nature == '目录':
                    # The first <h2> is the table-of-contents heading, so
                    # fall back to the tag text as the entry's nature.
                    nature = tag
                content = (soup.find_all("div", {"class": "lemma-summary"})[0]
                           .get_text().strip().rstrip("]").lstrip("["))

                if name != "百度百科错误页" and nature != "目录" and patton.search(tag):
                    print("序号:" + str(id))
                    print("名称:" + name)
                    print("属性:" + nature)
                    print("内容:" + content)
                    print("网址:" + page_url)
                    csv_writer.writerow([str(id), name, nature, content, page_url])
                    id += 1
            except Exception:
                # Best-effort scraping: report and move on. Unlike a bare
                # except, Ctrl-C (KeyboardInterrupt) still stops the run.
                print("出错!")
53 
54 if __name__=="__main__":
55     Head()  # writes the header row, truncating any existing CSV
56     read()  # then appends one row per matching word

查看全文

相关阅读:
项目管理
 开源视频会议bigbluebutton开发（1）——初始化安装以及配置
 oracle休系统结构
 Tomcat上安装配置Axis
锁表头
 文件复制三种方法
 程序员技术练级攻略
 Linux (RHEL 5.4)下安装 Oracle 10g R2
Android 学习资料收集汇总
 WAS61安装调整和应用部署.doc

原文地址：https://www.cnblogs.com/zlc364624/p/12292892.html