通过对 Python 语法的简短熟悉,结合百度搜索到的部分博主文章,拼凑了一个抓取页面内容的 demo。
学习记录下!
from requests_html import HTMLSession

import requests
import pymysql.cursors


def getJsonText(url):
    """Fetch *url* and return its JSON-decoded response body.

    Returns the parsed JSON object on success, or the string
    '请求失败!' on any failure (original error contract kept for
    backward compatibility with existing callers).
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
        # NOTE(review): hard-coded session cookie from the original post; it is
        # almost certainly expired, and the "Mon 54 31-Dec-2018" fragment looks
        # like a copy-paste artifact — confirm before relying on it.
        'Cookie': 'csrftoken=6ac95edd2e4866f1c5d2873d6295c5ce; tt_webid=6564523141883692558; uuid="w:1f2180a58a6048ab96b7dac4c8dbab81"; UM_distinctid=163dd0f3890548-04ef14e2aeee38-77256752-1fa400-163dd0f3891694; CNZZDATA1259612802=1435136700-1528418761-null%7C1531360188; tt_webid=6564523141883692558; WEATHER_CITY=%E5%8C%97%E4%BA%AC; _ga=GA1.2.1854637036.1528798614; login_flag=f5b3b0ab7f662248c014dc175aaab576; sessionid=1a2269ab6f9602fa1359cf507705e8b3; uid_tt=e5c3d73d536ad7d832d37328ce7ab08e; sid_tt=1a2269ab6f9602fa1359cf507705e8b3; sid_guard="1a2269ab6f9602fa1359cf507705e8b3|1530692681|15552000|Mon 54 31-Dec-2018 08:24:41 GMT"; __tasessionId=zj77s5mu41531359387030; _gid=GA1.2.973835841.1531360238',
    }
    try:
        # timeout added so a stalled server cannot hang the crawler forever
        r = requests.get(url, headers=headers, timeout=10)
        return r.json()
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Error string kept verbatim.
        return '请求失败!'
def getHtml(url):
    """Fetch *url* via requests_html and return the response's ``.html``.

    Returns None when *url* is None, and the string '抓取失败' on any
    request failure (original error contract kept for callers).
    """
    if url is None:
        return
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
    }
    try:
        session = HTMLSession()
        r = session.get(url, headers=headers)
        return r.html
    except Exception:
        return '抓取失败'


def jsonParser(url, html, path):
    """Append one dict-repr line per post of the 头条 JSON feed to *path*.

    *html* is the decoded JSON object from getJsonText (expects a 'data'
    list of posts — TODO confirm against the feed schema). *url* is
    unused but kept for signature compatibility.
    """
    postList = html['data']
    # Open the output file once instead of re-opening it for every post;
    # append mode and the written bytes are unchanged.
    with open(path, 'a', encoding='utf-8') as f:
        for post in postList:
            data = {
                'source': post['source'],
                'title': post['title'],
                'source_url': post['source_url'],
                'image_url': post['image_url'],
            }
            f.write(str(data))
            f.write(' ')


# HTML 页面 (HTML listing page)
def HtmlParser(url, html, path):
    """Parse a cbngold.com news-list page and insert each article into MySQL.

    *path* is accepted for signature compatibility but is not used here
    (this parser persists to the database, not to a file). *url* is
    likewise unused; links are resolved against a hard-coded base.
    """
    postList = html.find('div.entlist')
    # One connection for the whole page instead of connect/close per article.
    # NOTE(review): credentials are hard-coded; move to config.
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='111111',
                                 db='wikiurl',
                                 charset='utf8mb4')
    try:
        with connection.cursor() as cursor:
            sql = "insert into `urls`(`urlname`,`urlhref`,`content`) values(%s,%s,%s)"
            for rs in postList:
                data = {
                    'title': rs.find('a', first=True).text,
                    'desc': rs.find('div.entinfonews > p', first=True).text,
                    'time': rs.find('div.time', first=True).text,
                }
                detail_url = 'http://cbngold.com/' + rs.find('h2', first=True).find('a', first=True).attrs['href']
                data['content'] = HtmlDetailedParser(detail_url)
                # 数据入库处理 (persist row); commit per row as before so a
                # mid-page failure keeps already-inserted articles.
                cursor.execute(sql, (data['title'], detail_url, data['content']))
                connection.commit()
    finally:
        connection.close()


# 详细内容 url (article detail page)
def HtmlDetailedParser(url):
    """Return the text of the article body panel at *url*.

    getHtml returns the string '抓取失败' (or None) on failure; pass that
    through instead of crashing on `.find()` of a non-HTML object.
    """
    html = getHtml(url)
    if html is None or isinstance(html, str):
        return html
    return html.find('div#contentPanel', first=True).text


# 抓取 html 页面入口 (entry point for HTML scraping)
def HtmlMain(url):
    savepath = '/home/wwwroot/python_pro/text.txt'
    html = getHtml(url)
    HtmlParser(url, html, savepath)


# 解析 json 接口 (entry point for the JSON feed)
def JsonMain(url):
    savepath = '/home/wwwroot/python_pro/toutiao.txt'
    html = getJsonText(url)
    jsonParser(url, html, savepath)


# 入口 — example invocation kept from the original demo:
# HtmlMain('https://www.toutiao.com/ch/news_travel/')
# Example invocations kept from the original demo:
# HtmlMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=1531348824&max_behot_time_tmp=1531348824&tadrequire=true&as=A1358BD4F6AB71A&cp=5B46EB87D10ABE1&_signature=VhWM3gAADVR0cakAFkjT4lYVjM')
# JsonMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A115CB74368C533&cp=5B46AC55C3C37E1&_signature=ReOiVAAAHqdnh4eKksi3R0Xjok')

# Guarded so importing this module no longer triggers network + DB I/O.
if __name__ == '__main__':
    HtmlMain('http://cbngold.com/newslist.aspx?id=25&p=0')
感谢网络提供的方便,特别是度娘 ~~