  • Python: using the requests and threading modules to crawl the latest movie listings from 电影天堂 (Dytt) with multiple threads.

    Using the scraped data, I built a latest-movies site on Django:

       n1celll.xyz

    Today I want to put what I've learned to use and crawl all of the latest movie listings on 电影天堂. The modules involved:

        requests: fetches the page HTML (see the fetch sketch after this list)

        re: extracts the exact fields we want from the page text

        BeautifulSoup: makes tag searches easy when pulling out the fields we want

        threading: multiple threads drastically cut the total crawl time

        queue: a thread-safe queue feeding the file writer (in practice I stored everything in a database; see the database sketch below)

        That's about all the modules. Pointers from more experienced developers are welcome.
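
    Before the full script, a minimal sketch of the fetch pattern everything below relies on. Dytt serves gb2312/GBK-encoded pages, and requests often falls back to ISO-8859-1 when the HTTP headers don't declare a charset, so the response encoding has to be set before reading .text or the titles come out as mojibake (apparent_encoding is requests' own charset guess; hard-coding 'GBK' as the script does works too):

        import requests
        from bs4 import BeautifulSoup

        r = requests.get('http://www.ygdy8.net/html/gndy/dyzz/index.html',
                         headers={'User-Agent': 'Mozilla/5.0'})
        r.encoding = r.apparent_encoding  # or simply 'GBK'; without this, .text is garbled
        soup = BeautifulSoup(r.text, 'html.parser')
        print(soup.title.string)  # should print readable Chinese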

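    As mentioned above, the real project writes into the database behind the Django site rather than into test4.txt. The post doesn't show that code, so the following is only an illustrative sketch with a made-up sqlite schema (the movies table and its columns are hypothetical; the actual site presumably goes through Django models). Note that a sqlite3 connection may only be used from the thread that created it, which is one more reason to funnel all records through a single writer:

        import sqlite3

        conn = sqlite3.connect('movies.db')  # hypothetical database file
        conn.execute('''CREATE TABLE IF NOT EXISTS movies
                        (mname TEXT, myear TEXT, mcountry TEXT,
                         mtype TEXT, mlength TEXT, murl TEXT)''')

        def save_movie(msg, url):
            # msg is the dict assembled in get_data() below
            conn.execute('INSERT INTO movies VALUES (?, ?, ?, ?, ?, ?)',
                         (msg.get('mname'), msg.get('myear'), msg.get('mcountry'),
                          msg.get('mtype'), msg.get('mlength'), url))
            conn.commit()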

    # Author : 'n1celll'
    import requests
    import re
    from bs4 import BeautifulSoup
    import threading
    import queue
    
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    
    url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'
    
    def get_page(url):
        index = requests.get(url, headers=header)
        index.encoding = 'GBK'  # match the page's declared gb2312/GBK encoding
        t = index.text
        index_soup = BeautifulSoup(t, 'html.parser')  # parse the fetched HTML into a soup object
        all_pages = index_soup.find('select', attrs={'name': 'sldd'}).find_all('option')[-1]  # the last <option> holds the total page count
        page = int(all_pages.string)
        return page
    
    def get_data(page):
    
        page_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_%s.html' % (page)  # URL of one listing page
        print(page)
        res = requests.get(page_url, headers=header)
        res.encoding = 'GBK'  # the site declares gb2312; GBK is a superset
        a = res.text
        soup = BeautifulSoup(a, 'html.parser')
        name = soup.find_all('a', attrs={'class': 'ulink'})
        for i in name:
            try:
                movie_name = re.search('《(.*?)(》|】)', i.string).group()
                # two gotchas here: one title doesn't use 《》 brackets, and one movie has two <a> tags
            except (AttributeError, TypeError):  # no match, or i.string is None
                continue
            html = 'http://www.ygdy8.net' + i['href']
            da = requests.get(html, headers=header)
            da.encoding = 'GBK'  # alternatively: da.apparent_encoding
            db = da.text
            dr = BeautifulSoup(db, 'html.parser')
            span = dr.find('span', attrs={'style': 'FONT-SIZE: 12px'})
            if span:
                dc = span.text.split()
    
                data = ''.join(dc)  # stitch the whitespace-split fragments back into one string
                print(data)
                msg = {}
                if data:
                    msg['mname'] = movie_name
                    try:
                        show_t = re.search(r'(?<=(◎年代|◎时间|品年代|年代】|播时间|播】:))(.*?)(?=◎|年|【)', data).group()
                    except AttributeError:
                        show_t = re.search(r'(?<=日期|份:)(.*?)(?=(-|剧))', data).group()
                    msg['myear'] = show_t  # release year
                    try:
                        country = re.search(r'(?<=(◎国家|◎产地|◎地区|◎国别|国家】))(.*?)(?=◎|【类)', data).group()
                    except AttributeError:
                        try:
                            country = re.search(r'(?<=地区)(.*?)(?=语言)', data).group()
                        except AttributeError:
                            country = '未知'
                    msg['mcountry'] = country
                    try:
                        duration = re.search(r'(?<=◎片长|长度】)(.*?)(?=◎|【)', data).group()
                    except AttributeError:
                        duration = '未知'
                    msg['mlength'] = duration  # runtime
                    try:
                        mtype = re.search(
                            r'(?<=(◎类别|别类型|影类型|◎类型|集类型|◎分类|类型:|类别】|片类型|型】:))(.*?)(?=(◎|级别|【出品|【主演))', 
                            data).group()
                    except AttributeError:
                        try:
                            mtype = re.search(r'(?<=类型:)(.*?)(?=国)', data).group()
                        except AttributeError:
                            mtype = re.search(r'动作|爱情|战争', data).group()  # last resort; still raises if nothing matches
                    # the regexes above feel clumsy; suggestions from the pros are welcome (see the cleanup sketch after the script)
                    q.put('%s: %s,%s,%s,%s,%s\n' % (movie_name, country, mtype, duration, show_t, html))
    
    q = queue.Queue(maxsize=10000)
    t_obj = []
    lock = threading.Lock()  # guards the shared file handle
    def writing(f):
        data = q.get()
        with lock:  # only one thread writes to the file at a time
            f.write(data)
    all_page = get_page(url)
    f = open('test4.txt', 'w', encoding='utf8')
    print(all_page)  # total number of listing pages
    
    for i in range(1,all_page+1):
        t = threading.Thread(target=get_data,args=(i,))
        t.start()
        t_obj.append(t)
    for t in t_obj:
        t.join()  # make sure every crawl thread finishes before writing starts
        print('%s over' % t)
    
    while q.qsize():  # drain everything the crawl threads queued up
        w = threading.Thread(target=writing, args=(f,))
        w.start()
        w.join()
    print('writing finished')
    f.close()
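
    On the clumsy regexes the script apologizes for: one possible cleanup (a sketch, not from the original post; first_match is a made-up helper) is to list the candidate patterns and take the first that matches, which collapses each nested try/except above into a single call:

        import re

        def first_match(patterns, text, default='未知'):
            """Return the first matching pattern's text, or a default."""
            for pat in patterns:
                m = re.search(pat, text)
                if m:
                    return m.group()
            return default

        # e.g. the country extraction becomes:
        # country = first_match([
        #     r'(?<=(◎国家|◎产地|◎地区|◎国别|国家】))(.*?)(?=◎|【类)',
        #     r'(?<=地区)(.*?)(?=语言)',
        # ], data)

    One more note on the writer: as the script stands, every crawl thread is joined before the first writer thread starts, so the lock is barely contended and the queue is only drained after the fact. Starting a single long-lived consumer thread before the crawl, and pushing a sentinel value into the queue once all crawlers have joined, would let the disk writes overlap the network work.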
    
    Original post: https://www.cnblogs.com/n1celll/p/10628078.html