  • Python 3 in practice: fetching data from a website (Carbon Market Data-BJ) (pandas, bs4)

    While learning, I practiced on some simple real-world tasks; the programs below fetch the data I needed from a website.

    Along the way I gradually picked up new techniques and found Python to be remarkably convenient.

    In particular, using pandas to pull table data straight out of a web page is extremely handy!

    The code may not be elegant, but it basically meets my needs.

    Suggestions from more experienced readers are welcome.

    Version 04 (Jan 12 2017): the recommended method for extracting table data

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        import pandas as pd

        url2 = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site has 39 pages in total; that count was checked manually
            # and could later be scraped from the page itself.
            link = url2 + str(n)
            links.append(link)
        links.insert(0, url2)

        df2 = pd.DataFrame()  # start with an empty DataFrame
        for url in links:
            # fetch the tables with pandas; requires the html5lib module
            dfs = pd.read_html(url, header=0)
            for df in dfs:
                df2 = df2.append(df, ignore_index=True)

        # df2.to_excel('MktDataBJ.xlsx')  # save the data to an Excel file
        df2.to_csv('MktDataBJ-1.csv')  # save the data to a CSV file
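
    A note on pandas versions: DataFrame.append was deprecated and later removed in pandas 2.0, so on a current install the accumulation loop above fails. A minimal sketch of the same logic using pd.concat, reusing the links list built above:

        import pandas as pd

        frames = []
        for url in links:
            # read_html returns a list of DataFrames, one per <table> on the page
            frames.extend(pd.read_html(url, header=0))

        # combine all the page tables in a single call
        df2 = pd.concat(frames, ignore_index=True)
        df2.to_csv('MktDataBJ-1.csv')

    Collecting the frames in a list and concatenating once is also faster than appending inside the loop, since each append copies the entire DataFrame.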

    Version 03 (Jan 12 2017)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        from bs4 import BeautifulSoup
        import requests
        import csv

        url2 = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site has 39 pages in total; that count was checked manually
            # and could later be scraped from the page itself.
            link = url2 + str(n)
            links.append(link)
        links.insert(0, url2)

        for url in links:
            rep = requests.get(url)
            # content = rep.text.encode(rep.encoding).decode('utf-8')
            # # when using rep.text directly, the Chinese content needs re-encoding

            soup = BeautifulSoup(rep.content, 'html.parser')

            # table = soup.table
            table = soup.find('table')  # either form works

            trs = table.find_all('tr')
            trs2 = trs[1:]  # skip the header row
            list1 = []
            for tr in trs2:
                td = tr.find_all('td')
                row = [i.text for i in td]
                list1.append(row)

            with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
                f_csv = csv.writer(f)
                f_csv.writerows(list1)
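
    Because the CSV is opened in append mode inside the loop, the header row (trs[0]) is dropped entirely, and re-running the script keeps appending to the old file. A minimal sketch that opens the file once in write mode and emits the column names first, assuming the header cells are <th> elements (as Version 01 below confirms) and reusing links from above:

        from bs4 import BeautifulSoup
        import requests
        import csv

        with open('MktDataBJ.csv', 'w', errors='ignore', newline='') as f:
            f_csv = csv.writer(f)
            header_written = False
            for url in links:
                soup = BeautifulSoup(requests.get(url).content, 'html.parser')
                table = soup.find('table')
                if not header_written:
                    # column names come from the <th> cells of the first row
                    f_csv.writerow([th.text for th in table.find_all('th')])
                    header_written = True
                # the remaining rows hold the daily data
                for tr in table.find_all('tr')[1:]:
                    f_csv.writerow([td.text for td in tr.find_all('td')])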

    Version 02 (Jan 09 2017)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        from bs4 import BeautifulSoup
        import requests
        import csv

        url2 = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site has 39 pages in total; that count was checked manually
            # and could later be scraped from the page itself.
            link = url2 + str(n)
            links.append(link)
        links.insert(0, url2)
        # print(links)

        for url in links:
            rep = requests.get(url)
            # content = rep.text.encode(rep.encoding).decode('utf-8')
            # # when using rep.text directly, the Chinese content needs re-encoding

            soup = BeautifulSoup(rep.content, 'html.parser')
            body = soup.body
            data = body.find('div', {'class': 'list_right'})

            quotes = data.find_all('tr')
            quotes1 = quotes[1:]  # skip the header row

            list1 = []
            for x in quotes1:
                list2 = []
                for y in x.find_all('td'):
                    list2.append(y.text)  # one list per daily record
                list1.append(list2)
            # print(list1)  # list1 collects all daily records
            with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
                f_csv = csv.writer(f)
                f_csv.writerows(list1)
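
    One caveat that applies to every version here: requests.get is called without a timeout or a status check, so a stalled page hangs the loop and an error page gets parsed as if it were data. A small helper sketch (the fetch_soup name is my own, not from the original code):

        import requests
        from bs4 import BeautifulSoup

        def fetch_soup(url, timeout=10):
            """Fetch a page and parse it, raising on HTTP errors."""
            rep = requests.get(url, timeout=timeout)
            rep.raise_for_status()  # turn 4xx/5xx responses into exceptions
            return BeautifulSoup(rep.content, 'html.parser')

    Each loop above could then start with soup = fetch_soup(url) instead of the bare requests.get call.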

    Version 01 (Jan 08 2017)

        # Code based on Python 3.x
        # _*_ coding: utf-8 _*_
        # __Author: "LEMON"

        from bs4 import BeautifulSoup
        import requests
        import csv

        urllink = 'http://www.bjets.com.cn/article/jyxx/?'
        links = []
        for n in range(2, 40):
            # The site has 39 pages in total; that count was checked manually
            # and could later be scraped from the page itself.
            link = urllink + str(n)
            links.append(link)
        links.insert(0, urllink)
        # print(links)

        for url in links:
            rep = requests.get(url)
            # content = rep.text.encode(rep.encoding).decode('utf-8')
            # # when using rep.text directly, the Chinese content needs re-encoding

            soup = BeautifulSoup(rep.content, 'html.parser')
            # print(soup.prettify())

            body = soup.body
            data = body.find('div', {'class': 'list_right'})

            # table header cells
            titles = data.find_all('th')
            title = []
            for x in titles:
                title.append(x.text)
            # print(title)

            quotes = data.find_all('tr')
            quotes1 = quotes[1:]  # skip the header row
            # print(quotes1)

            list1 = []
            for x in quotes1:
                for y in x.find_all('td'):
                    list1.append(y.text)
            # print(list1)  # flat list of every cell value on the page

            date = []
            volumes = []
            meanprice = []
            totalmoney = []

            # each table row has four cells, so split the flat list by position
            for i in range(0, len(list1)):
                if i % 4 == 0:
                    date.append(list1[i])
                elif i % 4 == 1:
                    volumes.append(list1[i])
                elif i % 4 == 2:
                    meanprice.append(list1[i])
                else:
                    totalmoney.append(list1[i])

            # print(date)
            # print(volumes)
            # print(meanprice)
            # print(totalmoney)

            final = []
            for i in range(0, len(date)):
                temp = [date[i], volumes[i], meanprice[i], totalmoney[i]]
                final.append(temp)
            # print(final)
            with open('bj_carbon.csv', 'a', errors='ignore', newline='') as f:
                f_csv = csv.writer(f)
                f_csv.writerows(final)
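
    The modulo bookkeeping above splits the flat cell list into four column lists and then zips them back into rows. Since the table always has four cells per row, slicing the flat list directly is equivalent and much shorter; a minimal sketch:

        # list1 is the flat sequence [date, volume, mean price, total, date, ...]
        final = [list1[i:i + 4] for i in range(0, len(list1), 4)]
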
  • Original post: https://www.cnblogs.com/lemonbit/p/6262977.html