zoukankan      html  css  js  c++  java
  • 多进程爬虫+正则猫眼TOP100

    import requests
    from requests.exceptions import RequestException
    import re
    import json
    from multiprocessing import Pool
    import time


    def get_one_page(url, headers, timeout=10):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            headers: Mapping of HTTP request headers sent with the request.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests.get`` may block forever and
                stall the calling worker process.

        Returns:
            The response text when the status code is 200, ``None`` for any
            other status code, and the string ``"NOT NOW"`` when the request
            fails with a ``RequestException`` (timeouts included).
        """
        try:
            # NOTE(review): verify=False disables TLS certificate checks; it
            # is kept to preserve behaviour -- confirm it is really needed.
            response = requests.get(
                url, headers=headers, verify=False, timeout=timeout
            )
        except RequestException:
            return "NOT NOW"
        print(response.status_code)
        if response.status_code == 200:
            return response.text
        return None

    def parse_one_page(html):
        """Extract the movie entries from the markup of one board page.

        Args:
            html: Page markup as a string. ``None`` or an empty string (for
                example after a failed download) produces no entries instead
                of raising ``TypeError`` inside ``re``.

        Yields:
            One dict per ``<dd>`` entry with the string-valued keys
            ``index``, ``image``, ``title``, ``actor``, ``time`` and
            ``score``.
        """
        if not html:
            return
        # Raw strings keep the backslash in \d: the rank is a run of digits,
        # whereas a bare "d" would only match the literal letter and the
        # entry would never be recognised.
        pattern = re.compile(
            r'<dd>.*?board-index.*?>(\d*)</i>.*?data-src="(.*?)".*?name"><a'
            r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
            r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
            re.S,  # entries span several lines, so "." must match newlines
        )
        for rank, image, title, star, release, integer, fraction in \
                pattern.findall(html):
            yield {
                'index': rank,
                'image': image,
                'title': title,
                # Skips the first three characters after trimming --
                # presumably the cast label; verify against the live page.
                'actor': star.strip()[3:],
                # Skips the first five characters after trimming --
                # presumably the release-date label; verify likewise.
                'time': release.strip()[5:],
                # Integer part and fractional part are separate elements.
                'score': integer + fraction,
            }

    def write_to_file(content):
        """Append ``content`` to ``result.txt`` as a JSON document.

        Args:
            content: JSON-serialisable object to store.

        The file is opened in append mode with UTF-8 encoding, and
        ``ensure_ascii=False`` writes non-ASCII characters as themselves
        rather than as escape sequences.
        """
        with open("result.txt", 'a', encoding="utf-8") as f:
            # NOTE(review): records are separated by a single space, so the
            # whole file is one line; presumably a newline was intended --
            # confirm before changing the output format.
            f.write(json.dumps(content, ensure_ascii=False) + ' ')
        # No explicit close(): leaving the with block already closes the file.

    def main(offset):
        """Download one board page and store every movie found on it.

        Args:
            offset: Value placed in the ``offset`` query parameter of the
                board URL.

        Each parsed entry is printed and appended to the result file. A page
        that could not be fetched is reported and skipped.
        """
        headers = {
            'User-Agent': 'XXXXXXX',
            # "keep - alive" (with spaces) is not a valid Connection token.
            'connection': 'keep-alive',
            'Cookie': 'XXXXXXXXXXXXXXXXXXXXXXXXX',
        }
        url = "https://maoyan.com/board/4?offset=" + str(offset)
        html = get_one_page(url, headers)
        # get_one_page returns None for a non-200 status and "NOT NOW" when
        # the request itself fails. Passing None on to the parser raises
        # TypeError, which a pool worker started with apply_async would
        # swallow silently, so bail out explicitly instead.
        if html is None or html == "NOT NOW":
            print('skip offset %s: page could not be fetched' % offset)
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)

    if __name__ == "__main__":
        # The author tried pool.map(main, offsets) first and dropped it over
        # concerns about task ordering and unreported failures. The
        # apply_async variant below does not guarantee ordering either, so
        # the results may need sorting afterwards.
        started = time.time()
        pool = Pool()  # no argument: the pool size defaults to the CPU count
        for offset in range(0, 100, 10):
            # The returned AsyncResult is discarded, so an exception raised
            # inside a worker is never reported here.
            pool.apply_async(main, (offset,))
        # close() must come before join(); after close() no further tasks
        # can be submitted to the pool.
        pool.close()
        # join() blocks until every worker process has finished.
        pool.join()
        elapsed = time.time() - started
        print('多进程(非阻塞)执行共需时间为:%.2f' % elapsed)
  • 相关阅读:
    POJ 2485 Highways && HDU1102(20/200)
    easyui 后台框架搭建
    启动第二个Activity
    Apache配置基于域名的虚拟主机
    POJ_1679_The Unique MST(次小生成树模板)
    MySQL学习笔记
    数据库学习之简单的SQL语句
    HDU-4643-GSM(DFS)
    Android Studio VS Eclipse (还在用Eclipse?你OUT了!)
    使用国内镜像源来加速python pypi包的安装
  • 原文地址:https://www.cnblogs.com/Knight66666/p/12572271.html
Copyright © 2011-2022 走看看