zoukankan      html  css  js  c++  java
  • 猫眼电影爬取

    import csv
    import os
    import queue
    import threading

    import requests
    from fake_useragent import UserAgent
    from lxml import etree
    
    # Work queues shared by the worker threads:
    #   q_big  - listing-page URLs waiting to be fetched
    #   q_html - movie detail-page URLs waiting to be fetched
    q_big = queue.Queue()
    q_html = queue.Queue()

    # One random User-Agent string is chosen once, at import time, and reused
    # for every request made by this module.
    ua = UserAgent().random
    headers = {'User-Agent': ua}

    # A single session is shared by all requests; the home page is requested
    # once up front so any cookies it sets are kept on the session.
    s = requests.Session()
    s.get("https://maoyan.com/", headers=headers)
    
    
    # Send a request and parse the response body as HTML
    def request_get(url, timeout=10):
        """Fetch ``url`` with the shared session and return the parsed HTML tree.

        Args:
            url: Absolute URL to request.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` waits indefinitely, so one hung
                connection would stall the calling worker thread forever.

        Returns:
            The result of ``etree.HTML`` applied to the response text.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        res = s.get(url, headers=headers, timeout=timeout)
        return etree.HTML(text=res.text)
    
    
    # Collect the detail-page URL of every movie on each listing page
    def get_video_list():
        """Drain ``q_big`` and put each movie's detail-page URL on ``q_html``.

        Every listing-page URL taken from ``q_big`` is fetched with
        ``request_get``; for each ``<dd>`` entry under the ``movies-list``
        container the relative link is turned into an absolute URL and queued
        on ``q_html``.

        The function returns once ``q_big`` is empty. ``main`` puts every
        listing URL on ``q_big`` before starting the threads, so an empty
        queue means there is no more work (the previous version kept polling
        forever, which burned CPU and made ``join()`` hang).
        """
        while True:
            try:
                page_url = q_big.get_nowait()
            except queue.Empty:
                break

            try:
                html = request_get(page_url)
            except requests.RequestException as exc:
                # One unreachable page must not abort the remaining pages.
                print("request failed for {}: {}".format(page_url, exc))
                continue

            video_url_list = html.xpath('//div[@class="movies-list"]/dl//dd')
            print("video_url_list-->{}".format(video_url_list))
            for item in video_url_list:
                print("0.0")
                hrefs = item.xpath('./div[2]/a/@href')
                if not hrefs:
                    # Entry without the expected link: skip it instead of
                    # raising IndexError and killing the thread.
                    continue
                url = "https://maoyan.com" + hrefs[0]
                q_html.put(url)
                print("put的url:{}".format(url))
    
    
    def get_video_detail(idle_timeout=30):
        """Fetch queued detail pages and append one CSV row per movie.

        URLs are taken from ``q_html``; each page is fetched with
        ``request_get`` and the ``title``, ``pub_time`` and ``star`` values
        are extracted with XPath and appended to ``maoyan.csv``.

        Pages on which any of the three XPath lookups finds nothing are
        skipped. Previously the ``IndexError`` was swallowed and a row was
        still written, using unbound names or the previous movie's values.

        Args:
            idle_timeout: Seconds to block waiting for a new URL. When the
                wait times out and ``q_big`` is also empty the function
                returns. This is a heuristic end-of-work check: it assumes
                the producer needs less than ``idle_timeout`` seconds to
                queue URLs from its last listing page.
        """
        print("==============")
        while True:
            try:
                # Blocking get: no CPU is burned while the queue is empty.
                video_url = q_html.get(timeout=idle_timeout)
            except queue.Empty:
                if q_big.empty():
                    break
                continue

            print("get到的url:{}".format(video_url))
            try:
                info = request_get(video_url)
            except requests.RequestException as exc:
                # One unreachable page must not abort the remaining movies.
                print("request failed for {}: {}".format(video_url, exc))
                continue

            try:
                title = info.xpath(
                    '//div[@class="celeInfo-right clearfix"]/div/h1/text()')[0]
                pub_time = info.xpath(
                    '/html/body/div[3]/div/div[2]/div[1]/ul/li[3]/text()')[0]
                star = info.xpath(
                    '//*[@id="app"]/div/div[1]/div/div[3]/div[1]/div[2]/div[2]/div/div[1]/ul/li/div/a/text()')[0]
            except IndexError:
                # Page layout did not match; there is nothing valid to write.
                print("skipped (unexpected page layout): {}".format(video_url))
                continue

            star = star.strip().replace(",", "")
            print(title, pub_time, star)

            # newline='' is required by the csv module so that it controls
            # line endings itself; `with` closes the file even on error.
            with open('maoyan.csv', 'a', encoding='utf-8', newline='') as fp:
                csv.writer(fp).writerow([title, pub_time, star])
    
    
    def main():
        """Prepare the CSV file, queue the listing pages and run both workers.

        The header row is written only when ``maoyan.csv`` does not exist yet
        or is empty, so re-running the script appends data rows without
        inserting another header line in the middle of the file.
        """
        csv_path = 'maoyan.csv'
        needs_header = (
            not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
        )
        if needs_header:
            # newline='' lets the csv module control line endings itself.
            with open(csv_path, 'a', encoding='utf-8', newline='') as fp:
                csv.writer(fp).writerow(['title', 'pub_time', 'star'])

        # Build the listing-page URLs (offset 0, 30, ..., 480).
        urls = [
            'https://maoyan.com/films?showType=3&offset={}'.format(offset)
            for offset in range(0, 500, 30)
        ]
        print(urls)
        for url in urls:
            q_big.put(url)

        workers = [
            threading.Thread(target=get_video_list),    # collects detail-page URLs
            threading.Thread(target=get_video_detail),  # scrapes each detail page
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    
    
    # Start the crawl only when this file is run as a script, not on import.
    if __name__ == '__main__':
        main()
    

  • 相关阅读:
    浅读《构建之法》随笔
    个人学期总结
    201571030305/201571030306《小学生四则运算需求分析结对报告》
    小学生四则运算结对项目
    小学生四则运算
    读《构建之法》提出的问题
    个人学期总结
    实验四 小学生四则运算需求分析结对报告
    四则运算结对项目
    四则运算 201571030317
  • 原文地址:https://www.cnblogs.com/yzg-14/p/12199458.html
Copyright © 2011-2022 走看看