zoukankan      html  css  js  c++  java
  • python requests包爬网页数据demo

    通过对python语法的简短熟悉,结合百度搜索到的部分博主文章,拼凑了一个抓取页面内容的demo

    学习记录下!

    from requests_html import HTMLSession
    import requests
    import pymysql.cursors
    
    
    def getJsonText(url):
        """Fetch *url* with browser-like headers and return its JSON body.

        Returns the sentinel string '请求失败!' on any request or decode
        failure — kept as-is for backward compatibility with callers that
        check for that value instead of catching exceptions.
        """
        try:
            headers = {
                    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Connection':'keep-alive',
                    'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
                    'Cookie':'csrftoken=6ac95edd2e4866f1c5d2873d6295c5ce; tt_webid=6564523141883692558; uuid="w:1f2180a58a6048ab96b7dac4c8dbab81"; UM_distinctid=163dd0f3890548-04ef14e2aeee38-77256752-1fa400-163dd0f3891694; CNZZDATA1259612802=1435136700-1528418761-null%7C1531360188; tt_webid=6564523141883692558; WEATHER_CITY=%E5%8C%97%E4%BA%AC; _ga=GA1.2.1854637036.1528798614; login_flag=f5b3b0ab7f662248c014dc175aaab576; sessionid=1a2269ab6f9602fa1359cf507705e8b3; uid_tt=e5c3d73d536ad7d832d37328ce7ab08e; sid_tt=1a2269ab6f9602fa1359cf507705e8b3; sid_guard="1a2269ab6f9602fa1359cf507705e8b3|1530692681|15552000|Mon54 31-Dec-2018 08:24:41 GMT"; __tasessionId=zj77s5mu41531359387030; _gid=GA1.2.973835841.1531360238',                
                }
            # timeout so a dead endpoint cannot hang the scraper forever
            r = requests.get(url, headers=headers, timeout=10)
            return r.json()
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt /
            # SystemExit; `Exception` is the widest catch that is safe here.
            return '请求失败!'
        
    def getHtml(url):
        """Fetch *url* via requests_html and return the parsed ``r.html`` object.

        Returns None when *url* is None, and the sentinel string '抓取失败'
        on any failure — kept for backward compatibility with callers.
        """
        # Guard first: no URL means there is no work to do (the original
        # built the headers dict before this check).
        if url is None:
            return
        try:
            headers = {
                    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0',
                }
            session = HTMLSession()
            # timeout so a dead endpoint cannot hang the scraper forever
            r = session.get(url, headers=headers, timeout=10)
            return r.html
        except Exception:
            # Broad catch kept deliberately so callers keep receiving the
            # sentinel string instead of an exception (dropped the unused
            # `as e` binding).
            return '抓取失败'
    
    def jsonParser(url,html,path):
        """Append each post record from a feed-JSON dict to a text file.

        html: decoded JSON dict with a 'data' list of post dicts (each must
              have 'source', 'title', 'source_url', 'image_url' keys).
        path: file appended to, one str()-formatted dict per line.
        url:  unused; kept for signature parity with HtmlParser.
        """
        postList = html['data']
        # Open once for the whole batch (the original reopened the file for
        # every post); 'a' still preserves any existing content.
        with open(path, 'a', encoding='utf-8') as f:
            for post in postList:
                data = {
                        'source': post['source'],
                        'title': post['title'],
                        'source_url': post['source_url'],
                        'image_url': post['image_url'],
                    }
                f.write(str(data))
                # Fixed: the original literal was split across a raw newline
                # ("f.write('<newline>')"), which is a SyntaxError.
                f.write('\n')
    
    # HTML listing-page parser: scrape entries and store them in MySQL.
    def HtmlParser(url,html,path):
        """Parse the news-list page *html*, fetch each entry's detail page,
        and insert (title, detail url, content) rows into wikiurl.urls.

        html: a requests_html HTML object for the listing page.
        path: unused here; kept for signature parity with jsonParser.
        """
        postList = html.find('div.entlist')
        # One connection for the whole batch — the original opened and
        # closed a fresh MySQL connection inside the loop for every post.
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='111111',
                                     db='wikiurl',
                                     charset='utf8mb4')
        try:
            for rs in postList:
                data = {}
                data['title'] = rs.find('a', first=True).text
                data['desc'] = rs.find('div.entinfonews > p',first=True).text
                data['time'] = rs.find('div.time',first=True).text
                detial_url = 'http://cbngold.com/' + rs.find('h2',first=True).find('a',first=True).attrs['href']
                # Follow the entry link and pull the article body text.
                data['content'] = HtmlDetailedParser(detial_url)
                with connection.cursor() as cursor:
                    # Parameterized query — values are never interpolated
                    # into the SQL string.
                    sql = "insert into `urls`(`urlname`,`urlhref`,`content`) values(%s,%s,%s)"
                    cursor.execute(sql, (data['title'], detial_url, data['content']))
                connection.commit()
        finally:
            connection.close()
        
    # Detail-page fetcher: one URL in, article text out.
    def HtmlDetailedParser(url):
        """Return the plain text of div#contentPanel on *url*'s page."""
        page = getHtml(url)
        panel = page.find('div#contentPanel', first=True)
        return panel.text
    
    # Entry point for scraping an HTML listing page.
    def HtmlMain(url):
        """Download *url* and feed it to HtmlParser, appending to text.txt."""
        page = getHtml(url)
        HtmlParser(url, page, '/home/wwwroot/python_pro/text.txt')
    
    # Entry point for scraping a JSON feed endpoint.
    def JsonMain(url):
        """Download the JSON feed at *url* and append its posts to toutiao.txt."""
        payload = getJsonText(url)
        jsonParser(url, payload, '/home/wwwroot/python_pro/toutiao.txt')
        
    # Script entry point. The commented-out calls below are alternative
    # targets (toutiao HTML channel and JSON feed) kept for reference.
    #HtmlMain('https://www.toutiao.com/ch/news_travel/')
    #HtmlMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=1531348824&max_behot_time_tmp=1531348824&tadrequire=true&as=A1358BD4F6AB71A&cp=5B46EB87D10ABE1&_signature=VhWM3gAADVR0cakAFkjT4lYVjM')
    
    #JsonMain('https://www.toutiao.com/api/pc/feed/?category=news_finance&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A115CB74368C533&cp=5B46AC55C3C37E1&_signature=ReOiVAAAHqdnh4eKksi3R0Xjok')
    
    # Scrape the cbngold news list, page 0.
    HtmlMain('http://cbngold.com/newslist.aspx?id=25&p=0')
    

    感谢网络提供的方便,特别是度娘 ~~

  • 相关阅读:
    NPOIHelper.cs (NPOI 2.1.1)
    使用哈希加盐法来为密码加密【转】
    让普通控件拥有左键移动窗体的功能
    Reflector反编译.NET文件后修复【转】
    SD卡中FAT32文件格式快速入门(图文详细介绍)【转】
    项目管理知识体系指南(PMBOOK指南)(第5版) 阅读摘要
    数学
    位运算小结
    字符串(1)——Detect Capital
    数组和矩阵(3)——Next Greater Element I
  • 原文地址:https://www.cnblogs.com/murenhui/p/9300848.html
Copyright © 2011-2022 走看看