Storing crawler data in CSV
1. Simple reading and writing of CSV files
import csv

# CSV stands for "comma-separated values"; fields are usually separated by
# commas, but spaces or tabs can also serve as the delimiter.
# newline="" prevents extra blank lines on Windows (per the csv module docs).
csv_file = open("file/test.csv", "w", newline="")
# Write
try:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(("col1", "col2", "col3"))
    for i in range(4):
        csv_writer.writerow((i * 1, i * 10, i * 100))
finally:
    csv_file.close()

# Read back (the write above could of course also use the with-as syntax)
with open("file/test.csv", "r", newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)
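Besides writer and reader, the csv module also provides DictWriter and DictReader, which map each row to a dictionary keyed by the header. A minimal sketch (the file name file/test_dict.csv is just an example for illustration):

import csv

# Write rows as dictionaries; fieldnames fixes the column order.
with open("file/test_dict.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["col1", "col2", "col3"])
    writer.writeheader()
    writer.writerow({"col1": 1, "col2": 10, "col3": 100})

# Read rows back as dictionaries keyed by the header row.
with open("file/test_dict.csv", "r", newline="") as f:
    for row in csv.DictReader(f):
        print(row["col1"], row["col2"], row["col3"])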
2. Crawler practice: scrape the world university rankings and save them to CSV
import requests
from bs4 import BeautifulSoup
import csv

"""
https://www.dxsbb.com/news/16131.html
Scrape the top 1000 world universities from this site. The data is an HTML
table, so store it in a CSV file.
"""

def write_csv(data_list):
    # The page is gb2312-encoded and some of its characters cannot be written
    # to the CSV as-is, so the file is written as utf-8 instead.
    # newline="" prevents extra blank lines on Windows.
    csv_file = open("file/university.csv", "w", newline="", encoding="utf-8")
    try:
        csv_writer = csv.writer(csv_file)
        for row in data_list:
            # The cells were joined with a space earlier, so split on ' ' to
            # turn each row back into a list; [:-1] drops the empty string
            # left behind by the trailing space.
            csv_writer.writerow(row.split(' ')[:-1])
    finally:
        csv_file.close()
def get_soup(url, encode):
    # Fetch the page and return a BeautifulSoup object
    header = {
        'User-Agent': 'python-requests/2.22.0',
        'Accept-Encoding': 'gzip, deflate',
        'Accept': '*/*',
        'Connection': 'keep-alive',
    }
    resp = requests.get(url=url, headers=header)
    resp.encoding = encode
    return BeautifulSoup(resp.text, "lxml")
def get_university(soup):
    # Extract the table data: one space-joined string per table row
    tr_list = soup.find("table").find_all("tr")
    td_list = []
    for un_row in tr_list:
        td_row = ""
        for un_td in un_row.find_all("td"):
            td_row = td_row + un_td.text + " "
        td_list.append(td_row)
    return td_list
if __name__ == '__main__':
    url = r"https://www.dxsbb.com/news/16131.html"
    # Get the BeautifulSoup object; note this site uses the gb2312 encoding
    soup = get_soup(url, "gb2312")
    data_list = get_university(soup)
    write_csv(data_list)
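The join-then-split round trip above works, but writerow accepts a list directly, so each row could instead be collected as a list of cell texts. A sketch of get_university rewritten that way (a variant for illustration, not the version used above):

def get_university_rows(soup):
    # Collect each table row as a list of cell texts; csv_writer.writerow
    # can take this list directly, so no join/split round trip is needed.
    rows = []
    for tr in soup.find("table").find_all("tr"):
        cells = [td.text.strip() for td in tr.find_all("td")]
        if cells:  # skip rows with no <td> cells (e.g. header-only rows)
            rows.append(cells)
    return rows

With this variant, write_csv would simply call csv_writer.writerow(row) on each list unchanged.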
Result: the file university.csv is generated; open it to see the scraped data.
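To spot-check the result from Python as well (assuming the script above has run and file/university.csv exists):

import csv

# Print the first five rows of the generated file as a quick sanity check.
with open("file/university.csv", "r", newline="", encoding="utf-8") as f:
    for i, row in enumerate(csv.reader(f)):
        if i >= 5:
            break
        print(row)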