  • Python AjaxSpider code demo: the script below queries Toutiao's Ajax search endpoint for image galleries, parses each detail page, downloads the pictures, and stores the metadata in MongoDB.


    
    import re  # regular expressions, used to extract the embedded gallery JSON
    import json  # JSON parsing
    import os  # file-system helpers used by save_image()
    import pymongo  # MongoDB client
    import requests  # HTTP requests
    from hashlib import md5  # MD5 hash, used to name downloaded images
    from bs4 import BeautifulSoup  # HTML parsing
    from multiprocessing import Pool  # process pool for parallel crawling
    from urllib.parse import urlencode  # build the query string
    from json.decoder import JSONDecodeError  # JSON parsing errors
    from requests.exceptions import RequestException  # HTTP request errors
    
    from config import *  # MONGO_URL, MONGO_DB, MONGO_TABLE, KEYWORD, GROUP_START, GROUP_END
    
    # Shared MongoDB connection (connect=False defers connecting until first use,
    # which avoids fork-related issues when used with multiprocessing)
    client = pymongo.MongoClient(MONGO_URL, connect=False)
    db = client[MONGO_DB]
    
    # Fetch one page of the search index (an Ajax JSON endpoint)
    def get_page_index(offset, keyword):
        # Query-string parameters for the Ajax request
        data = {
            'offset': offset,      # paging offset
            'format': 'json',      # response format
            'keyword': keyword,    # search keyword
            'autoload': 'true',
            'count': '20',
            'cur_tab': 3,
        }
        url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
        try:
            response = requests.get(url)
            # Only return the body when the request succeeded
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('Error requesting the index page')
            return None
    
    # Parse the index JSON and yield each article's detail-page URL
    def parse_page_index(html):
        try:
            data = json.loads(html)
            if data and 'data' in data.keys():
                for item in data.get('data'):
                    yield item.get('article_url')
        except JSONDecodeError:
            pass
    
    # Fetch the detail page of a single article
    def get_page_detail(url):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print('Error requesting the detail page', url)
            return None
    
    
    # Parse a detail page: extract the title and the gallery image URLs
    def parse_page_detail(html, url):
        soup = BeautifulSoup(html, 'lxml')
        title = soup.select('title')[0].get_text()
        print(title)
        # The gallery data is embedded in the page as "var gallery = {...};"
        images_pattern = re.compile('var gallery = (.*?);', re.S)
        result = re.search(images_pattern, html)
        if result:
            data = json.loads(result.group(1))
            if data and 'sub_images' in data.keys():
                sub_images = data.get('sub_images')
                images = [item.get('url') for item in sub_images]
                for image in images:
                    download_image(image)
                return {
                    'title': title,
                    'url': url,
                    'images': images,
                }
    
    # Save one result record to MongoDB
    def save_to_mongo(result):
        if db[MONGO_TABLE].insert_one(result):
            print('Saved to MongoDB', result)
            return True
        return False
    
    # Download a single image and hand its bytes to save_image()
    def download_image(url):
        print('Downloading', url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                save_image(response.content)
            return None
        except RequestException:
            print('Error requesting the image', url)
            return None
    
    # Write the image to disk, named by the MD5 of its content to avoid duplicates
    def save_image(content):
        file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
        if not os.path.exists(file_path):
            with open(file_path, 'wb') as f:
                f.write(content)
    
    
    # Crawl one index page: fetch the index, then every detail page it lists
    def main(offset):
        html = get_page_index(offset, KEYWORD)
        for url in parse_page_index(html):
            html = get_page_detail(url)
            if html:
                result = parse_page_detail(html, url)
                if result:
                    save_to_mongo(result)
                print(result)
    
    if __name__ == '__main__':
        # One offset per page of 20 results, crawled in parallel by a process pool
        groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
        pool = Pool()
        pool.map(main, groups)
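
The script imports its settings from a companion config.py (the `from config import *` line above). A minimal sketch is shown below: the variable names are the ones the spider actually uses, while the values are only example placeholders to adapt to your own environment.

    # config.py -- settings imported by the spider (example values, adjust as needed)
    MONGO_URL = 'localhost'      # MongoDB host or connection string (placeholder)
    MONGO_DB = 'toutiao'         # database name (placeholder)
    MONGO_TABLE = 'toutiao'      # collection that receives the scraped records (placeholder)
    
    GROUP_START = 1              # first page group to crawl (example value)
    GROUP_END = 20               # last page group; each group x maps to offset = x * 20
    KEYWORD = '街拍'              # search keyword sent to the Ajax endpoint

With config.py in place, running the spider module directly starts the process pool, stores each gallery's metadata in MongoDB, and saves the downloaded images into the current working directory.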
    
    
    
  • Original article: https://www.cnblogs.com/wordgao/p/9824675.html