import requests
import re
def get_url(url):
    """Download an index page and return the link prefixes listed on it.

    The response body is decoded as GBK and searched for list items of the
    form ``<li><a href='X.html'>...</a></li>``. Each ``X`` (the href value
    without its ``.html`` suffix) is collected.

    The request is sent with the module-level ``headers`` dict, which must be
    defined before this function is called (the script entry point does so).

    Args:
        url: Address of the page to download.

    Returns:
        A list of href strings with the trailing ``.html`` removed, in the
        order they appear in the page. Empty if nothing matches.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # A single undecodable byte should not abort link extraction, so bad
    # bytes are replaced rather than raising UnicodeDecodeError.
    data = resp.content.decode('gbk', errors='replace')
    # The dot before "html" is escaped so only a literal ".html" suffix is
    # stripped; re.S lets the link text span line breaks.
    link_pattern = re.compile(r"<li><a href='(.*?)\.html'>.*?</a></li>", re.S)
    return link_pattern.findall(data)
def get_response(link):
    """Fetch the daily pages for March 2018 and print three fields per page.

    For every entry in ``link`` the pages
    ``http://www.tianqihoubao.com<entry>/201803DD.html`` are requested for
    DD = 01..31. From each page the first match of three patterns is taken:
    the ``Keywords`` meta tag content, the first bold cell styled
    ``color:#E54600`` (treated as the "low" value by this script) and the
    first bold cell styled ``color:#000065`` (treated as the "high" value).
    The three values are printed on one line, separated by spaces.

    Pages on which any of the three patterns is missing are reported on
    stderr and skipped instead of aborting the whole run.

    The request is sent with the module-level ``headers`` dict, which must be
    defined before this function is called (the script entry point does so).

    Args:
        link: Sequence of URL path prefixes (as returned by ``get_url``).

    Returns:
        None. Results are written to stdout.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    import sys  # local import: only needed for the skip notice below

    # Compile once; the patterns are identical for every page.
    city_re = re.compile('<meta name="Keywords" content="(.*?)" />')
    low_re = re.compile(' <td style="color:#E54600" ><b>(.*?)</b></td>')
    high_re = re.compile('<td style="color:#000065"><b>(.*?)</b></td>')

    for path in link:
        day_prefix = 'http://www.tianqihoubao.com' + path + '/201803'
        for day in range(1, 32):
            # zfill gives the two-digit day (01..31) used in the page names.
            url = day_prefix + str(day).zfill(2) + '.html'
            # Without a timeout, requests waits indefinitely on a stall.
            resp = requests.get(url, headers=headers, timeout=10)
            # NOTE(review): get_url decodes its page explicitly as GBK, while
            # this relies on the encoding requests picks for .text; confirm
            # the printed text is not garbled.
            page = resp.text

            city = city_re.search(page)
            low = low_re.search(page)
            high = high_re.search(page)
            if city is None or low is None or high is None:
                # One page without the expected markup must not stop the
                # remaining pages from being processed.
                print('skipping ' + url + ': expected fields not found',
                      file=sys.stderr)
                continue

            print(city.group(1), low.group(1), high.group(1))
if __name__ == '__main__':
    # Request headers with a browser-like User-Agent. get_url and
    # get_response read this module-level name, so it must be set first.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # Index page whose links are crawled; each link is then fetched day by day.
    index_page = 'http://www.tianqihoubao.com/weather/province.aspx?id=330000'
    get_response(get_url(index_page))