# 通过简短的对python语法的熟悉, 结合百度搜索的部分博主文章, 拼写了个抓取页面内容的demo
# 学习记录下!
from requests_html import HTMLSession
import requests
import pymysql.cursors
def getJsonText(url):
    """Fetch *url* and return its parsed JSON body.

    Returns the decoded JSON object on success, or the string
    '请求失败!' on any request/decoding failure (original error-string
    contract is preserved for existing callers such as JsonMain).
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
        # NOTE(review): session cookie is hard-coded and will expire;
        # move to configuration for real use.
        'Cookie': 'csrftoken=6ac95edd2e4866f1c5d2873d6295c5ce; tt_webid=6564523141883692558; uuid="w:1f2180a58a6048ab96b7dac4c8dbab81"; UM_distinctid=163dd0f3890548-04ef14e2aeee38-77256752-1fa400-163dd0f3891694; CNZZDATA1259612802=1435136700-1528418761-null%7C1531360188; tt_webid=6564523141883692558; WEATHER_CITY=%E5%8C%97%E4%BA%AC; _ga=GA1.2.1854637036.1528798614; login_flag=f5b3b0ab7f662248c014dc175aaab576; sessionid=1a2269ab6f9602fa1359cf507705e8b3; uid_tt=e5c3d73d536ad7d832d37328ce7ab08e; sid_tt=1a2269ab6f9602fa1359cf507705e8b3; sid_guard="1a2269ab6f9602fa1359cf507705e8b3|1530692681|15552000|Mon 54 31-Dec-2018 08:24:41 GMT"; __tasessionId=zj77s5mu41531359387030; _gid=GA1.2.973835841.1531360238',
    }
    try:
        r = requests.get(url, headers=headers)
        return r.json()
    except Exception:
        # Narrower than the original bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        return '请求失败!'
def getHtml(url):
    """Fetch *url* with requests_html and return the parsed HTML object.

    Returns None when *url* is None, and the string '抓取失败' on any
    request failure (original error-string contract kept for callers).
    """
    # Guard hoisted out of the try: a None url is not a "fetch failure".
    if url is None:
        return None
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
    }
    try:
        session = HTMLSession()
        r = session.get(url, headers=headers)
        return r.html
    except Exception:
        return '抓取失败'
def jsonParser(url, html, path):
    """Append one line per feed post from *html*['data'] to the file *path*.

    *html* is the decoded JSON from getJsonText (expects a 'data' list of
    dicts with source/title/source_url/image_url keys).  *url* is unused
    but kept for signature parity with HtmlParser.

    Fixes vs. original: the newline literal was broken across two source
    lines (a syntax error, intended as '\\n'); the file is now opened once
    and every post is written — the original rebuilt `data` each iteration
    and wrote after the loop, persisting only the last post.
    """
    with open(path, 'a', encoding='utf-8') as f:
        for post in html['data']:
            record = {
                'source': post['source'],
                'title': post['title'],
                'source_url': post['source_url'],
                'image_url': post['image_url'],
            }
            f.write(str(record))
            f.write('\n')
#HTML页面
def HtmlParser(url, html, path):
    """Parse a cbngold.com news-list page and insert each post into MySQL.

    *html* is a requests_html HTML object from getHtml.  *url* and *path*
    are unused but kept for signature parity with jsonParser.  For each
    'div.entlist' entry: scrape title/desc/time, fetch the detail page via
    HtmlDetailedParser, and insert a row into wikiurl.urls.
    """
    postList = html.find('div.entlist')
    for rs in postList:
        data = {
            'title': rs.find('a', first=True).text,
            'desc': rs.find('div.entinfonews > p', first=True).text,
            'time': rs.find('div.time', first=True).text,
        }
        detial_url = 'http://cbngold.com/' + rs.find('h2', first=True).find('a', first=True).attrs['href']
        data['content'] = HtmlDetailedParser(detial_url)
        # Persist one row per post.  NOTE(review): DB credentials are
        # hard-coded here; move them to config/env for real use.  A new
        # connection per post is wasteful but matches the original flow.
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='111111',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            with connection.cursor() as cursor:
                # Parameterized SQL (already safe against injection).
                sql = "insert into `urls`(`urlname`,`urlhref`,`content`) values(%s,%s,%s)"
                cursor.execute(sql, (data['title'], detial_url, data['content']))
            connection.commit()
        finally:
            connection.close()
#详细内容 url
def HtmlDetailedParser(url):
    """Fetch the detail page at *url* and return the text of div#contentPanel.

    NOTE(review): getHtml may return the string '抓取失败' on failure, in
    which case .find here raises TypeError — confirm desired error handling.
    """
    html = getHtml(url)
    content = html.find('div#contentPanel', first=True).text
    return content
# Entry point for scraping an HTML listing page.
def HtmlMain(url):
    """Fetch *url* as HTML and persist its posts (see HtmlParser)."""
    savepath = '/home/wwwroot/python_pro/text.txt'
    html = getHtml(url)
    HtmlParser(url, html, savepath)
# Entry point for scraping a JSON feed endpoint.
def JsonMain(url):
    """Fetch *url* as JSON and append its posts to a text file (see jsonParser)."""
    savepath = '/home/wwwroot/python_pro/toutiao.txt'
    feed = getJsonText(url)
    jsonParser(url, feed, savepath)
# Script entry point.  Guarded so importing this module does not trigger
# a network scrape (the original ran HtmlMain at import time).
if __name__ == '__main__':
    #HtmlMain('https://www.toutiao.com/ch/news_travel/')
    #HtmlMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=1531348824&max_behot_time_tmp=1531348824&tadrequire=true&as=A1358BD4F6AB71A&cp=5B46EB87D10ABE1&_signature=VhWM3gAADVR0cakAFkjT4lYVjM')
    #JsonMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A115CB74368C533&cp=5B46AC55C3C37E1&_signature=ReOiVAAAHqdnh4eKksi3R0Xjok')
    HtmlMain('http://cbngold.com/newslist.aspx?id=25&p=0')
# 感谢网络提供的方便, 特别是度娘 ~~