zoukankan      html  css  js  c++  java
  • 自编码爬取今日头条街拍

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    """
        1.抓取索引页内容,利用requests请求目标站点,得到索引网页Html代码,返回结果
        2.抓取详情页内容,解析返回结果,得到详情页的链接,并进一步抓取详情页的信息
        3.下载图片与保存数据库,将图片下载到本地,并把页面信息及图片URL保存到MongoDB
        4.开启循环及多线程,对多页内容遍历,开启多线程提高抓取速度
    """
    import json
    import os
    import re
    from multiprocessing import Pool
    from bs4 import BeautifulSoup
    import requests
    from urllib.parse import urlencode
    from requests.exceptions import RequestException
    import pymongo
    from config import *
    from hashlib import md5
    
    client = pymongo.MongoClient(MONGDB_URL)
    # Fix: client.db[MONGO_DB] selected a collection named MONGO_DB inside a
    # database literally called "db"; the database must be selected by name.
    db = client[MONGO_DB]
    
    
    def request_url(url):
        """GET *url* and return the response body as text.

        Returns None on any non-200 status or on a request failure.
        """
        try:
            # timeout prevents a dead/slow host from hanging a worker forever
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            print("请求索引页出错", url)
            return None
    
    
    def get_page_index(offset, keyword=KEYWORD):
        """Fetch one page of the Toutiao search-index JSON for *keyword*.

        *offset* is the paging offset (multiples of 20). Returns the raw
        response text, or None when the request failed.
        """
        query = urlencode({
            'offset': offset,
            'format': 'json',
            'keyword': keyword,
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
        })
        return request_url('http://www.toutiao.com/search_content/?' + query)
    
    
    def parse_parse_index(html):
        """Yield each article URL found in an index-page JSON response.

        *html* is the raw JSON text from get_page_index (may be None when
        the request failed). Yields nothing on missing or malformed input
        instead of raising, so main() can iterate the result safely.
        """
        if not html:
            return
        try:
            data = json.loads(html)
        except (json.JSONDecodeError, TypeError):
            # request_url can hand back None or non-JSON error pages
            return
        if data and 'data' in data.keys():
            for item in data.get("data"):
                yield item.get("article_url")
    
    
    def get_page_detail(url):
        """Fetch the raw HTML of one gallery detail page (None on failure)."""
        return request_url(url)
    
    
    def parse_page_detail(html, url):
        """Parse one gallery detail page and download every image in it.

        Extracts the <title> text and the image URLs embedded in the page's
        ``var gallery = {...};`` JavaScript blob. Returns a dict with keys
        "title", "url", "images", or None when the page has no gallery data.
        """
        soup = BeautifulSoup(html, 'lxml')
        title_tags = soup.select('title')
        title = title_tags[0].get_text() if title_tags else None

        # the image list lives in an inline JS assignment, not in the DOM
        gallery_match = re.search(re.compile("var gallery = (.*?);", re.S), html)
        if not gallery_match:
            return None
        gallery = json.loads(gallery_match.group(1))
        if not gallery or "sub_images" not in gallery.keys():
            return None

        images = [entry.get("url") for entry in gallery.get('sub_images')]
        for image_url in images:
            download_image(image_url)
        return {
            "title": title,
            "url": url,
            "images": images,
        }
    
    
    def save_to_mongo(result):
        """Insert one crawl record into MongoDB.

        Returns True on success, False on failure. Uses insert_one —
        Collection.insert is deprecated and removed in pymongo 4 — and
        catches Mongo errors so one bad write cannot kill the whole crawl.
        """
        try:
            if db[MONGO_TABLE].insert_one(result):
                print("存储到MongoDB成功", result)
                return True
        except pymongo.errors.PyMongoError:
            print("存储到MongoDB失败", result)
        return False
    
    
    def download_image(url):
        """Download one image and persist it to disk via save_iamge.

        Fixes two defects: the old code went through request_url, which
        returns response.text (decoded str) and corrupts binary image data,
        and it never saved the download at all (save_iamge was dead code).
        Returns the raw bytes on success, None on failure.
        """
        print("正在下载图片", url)
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                # .content (bytes), never .text: images are binary
                save_iamge(response.content)
                return response.content
            return None
        except RequestException:
            print("请求图片出错", url)
            return None
    
    
    def save_iamge(content):
        """Write raw image bytes to ./toutiao/<md5>.jpg under the CWD.

        The md5 of the content is the filename, so duplicate images are
        written only once. (Name keeps the original's "iamge" typo for
        backward compatibility with existing callers.)
        """
        file_dir = os.path.join(os.getcwd(), "toutiao")
        file_path = os.path.join(file_dir, md5(content).hexdigest() + '.jpg')
        if not os.path.exists(file_path):
            # exist_ok avoids the check-then-create race of the old os.mkdir
            os.makedirs(file_dir, exist_ok=True)
            with open(file_path, 'wb') as f:
                f.write(content)
    
    
    def main(offset):
        """Crawl one index page: list its galleries, parse and persist each."""
        index_html = get_page_index(offset, KEYWORD)
        for detail_url in parse_parse_index(index_html):
            detail_html = get_page_detail(detail_url)
            if not detail_html:
                continue
            record = parse_page_detail(detail_html, detail_url)
            if record:
                save_to_mongo(record)
    
    if __name__ == '__main__':
        # offsets step by 20 to match the API's count=20 paging
        groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
        pool = Pool()
        try:
            pool.map(main, groups)
        finally:
            # the old code leaked the pool: workers were never reaped
            pool.close()
            pool.join()
    

      

  • 相关阅读:
    JAVA中添加jar包
    shell 脚本读取数据库示例
    Div 布局之 DIV垂直居中显示
    awk 学习笔记
    提示ufmyhr日志已满,无法继续操作软件,如何解决
    12种貌似卫生的不卫生习惯
    远程通客户端反复提示要下载客户端软件
    固定资产反启用后再启用报00:00:00错误
    2008年5月14日
    睡前六个必要动作,一觉睡到大天亮
  • 原文地址:https://www.cnblogs.com/nixingguo/p/7262438.html
Copyright © 2011-2022 走看看