# coding:utf-8
#
# Scrape the train timetable pages under qq.ip138.com/train/, echo the parsed
# rows to the console, and save them to the file train_time.txt.
#
import random
import time

import requests
from bs4 import BeautifulSoup

BSLIB = 'html5lib'
BASE_URL = 'http://qq.ip138.com'
UA = [
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
]

# Codec applied to the raw bytes of every downloaded page.
PAGE_ENCODING = 'gbk'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Name of the file the scraped timetable is written to.
OUTPUT_PATH = 'train_time.txt'


def _fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    A user agent is picked at random from ``UA`` for each request.

    Raises:
        requests.RequestException: if the request fails or exceeds
            ``REQUEST_TIMEOUT`` seconds.
    """
    headers = {'user-agent': random.choice(UA)}
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Decode the raw bytes ourselves instead of round-tripping through
    # response.text: response.encoding may be None (encode() would raise a
    # TypeError) or a lossy guess that cannot be re-encoded.
    html = response.content.decode(PAGE_ENCODING, errors='replace')
    return BeautifulSoup(html, BSLIB)


def _iter_links(soup, selector):
    """Yield ``(link text, absolute URL)`` for anchors matching *selector*.

    Anchors without an ``href`` attribute are skipped, because there is no
    page to follow for them.
    """
    for anchor in soup.select(selector):
        href = anchor.get('href')
        if not href:
            continue
        yield anchor.text, BASE_URL + href


def get_province(province, url, file):
    """Scrape one province page and every city page it links to.

    Args:
        province: Province name; printed and written to *file*.
        url: Absolute URL of the province page.
        file: Writable text file object that receives the output.
    """
    print(province)
    file.write("%s " % province)
    soup = _fetch_soup(url)
    links = _iter_links(soup, 'div > table > tbody > tr > td > a')
    for city_name, city_url in links:  # one link per city
        get_city(city_name, city_url, file)
    # Pause so the server does not reject us for requesting too often.
    # NOTE(review): the source arrived with its indentation lost; this sleep
    # is assumed to run once per province rather than once per city - confirm.
    time.sleep(random.random()*30)


def get_city(city, url, file):
    """Scrape one city page and emit a line of text per train listed on it.

    Args:
        city: City name; printed and written to *file*.
        url: Absolute URL of the city page.
        file: Writable text file object that receives the output.
    """
    print(' %s' % city)
    file.write(" %s " % city)
    soup = _fetch_soup(url)
    for row in soup.select('div#checilist > table > tbody > tr'):  # one row per train
        # NOTE(review): the separators below are reproduced exactly as found
        # (single spaces); the original may have used tab/newline escapes
        # that were lost before this file was recovered - confirm.
        row_text = " " + "".join("%s " % cell.text for cell in row.select('td'))
        print(row_text)
        file.write('%s ' % row_text)
    # Pause so the server does not reject us for requesting too often.
    # NOTE(review): assumed to run once per city page, not once per table
    # row (no request is made per row) - confirm.
    time.sleep(random.random()*4)


def main():
    """Walk the province index and write the whole timetable to OUTPUT_PATH."""
    # The context manager flushes and closes the file even when a request
    # fails part-way through, so rows scraped so far are not lost. UTF-8 is
    # requested explicitly because the platform default encoding may be
    # unable to represent the Chinese text being written.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as out_file:
        index = _fetch_soup(BASE_URL+'/train/')
        links = _iter_links(index, 'table[width="600"] > tbody > tr > td > a')
        for province_name, province_url in links:  # one link per province
            get_province(province_name, province_url, out_file)


if __name__ == '__main__':
    main()
# Stray page caption recovered with the script: "地区列车经过查询" (lookup of trains passing through a region).