zoukankan      html  css  js  c++  java
  • 爬取电影天堂上最新电影的下载链接的源码

    爬取电影天堂上最新电影的下载链接的源码

    import requests, sqlite3, time
    from os.path import exists
    from os import remove,system
    from bs4 import BeautifulSoup
    from time import strftime
    from threading import Thread
    from datetime import datetime
    
    def getmsg(url):
        """Scrape one listing page and return its movie entries.

        Args:
            url: Address of a listing page.

        Returns:
            A list of ``[movie_time, movie_name, movie_url]`` lists, one per
            ``table.tbspan`` entry found inside ``div.co_content8``.  The list
            is empty when the page does not contain that container.

        Raises:
            requests.RequestException: If the page cannot be fetched, including
                when the server does not answer within the timeout.
        """
        movies = []
        # Without a timeout a stalled server would block the crawl forever.
        res = requests.get(url, timeout=30)
        res.encoding = 'gbk'
        soup = BeautifulSoup(res.text, 'html.parser')
        container = soup.find('div', class_='co_content8')
        if container is None:
            # Unexpected page layout (error page, redirect, ...): nothing to collect.
            return movies
        for entry in container.find_all('table', class_='tbspan'):
            time_tag = entry.find('font', color='#8F8C89')
            # Each entry is expected to hold two <a> tags and the second one is
            # the movie link, so every anchor is collected rather than the first.
            links = entry.find_all('a')
            if time_tag is None or len(links) < 2 or not links[1].get('href'):
                # Skip malformed entries instead of aborting the whole page.
                continue
            # Keeps characters 3..21 of the label text; presumably a
            # 19-character timestamp following a 3-character prefix -- confirm
            # against the live page markup.
            movie_time = time_tag.get_text()[3:22]
            movie_name = links[1].get_text()
            movie_url = "https://www.dytt8.net" + links[1]['href']
            movies.append([movie_time, movie_name, movie_url])
        return movies
    
    def save_db(movie_time, movie_name, movie_url):
        """Insert one movie row into today's SQLite database.

        The database file name is rebuilt from the current local date on every
        call, and the ``dytt_movies`` table must already exist in that file.

        Args:
            movie_time: Timestamp text stored in the first column.
            movie_name: Movie title stored in the second column.
            movie_url: Link stored in the third column.

        Raises:
            sqlite3.Error: If the insert fails (for example, missing table).
        """
        dbname = '电影天堂_' + strftime('%y-%m-%d', time.localtime()) + '.sqlite'
        conn = sqlite3.connect(dbname)
        try:
            # Placeholders let sqlite3 quote the values itself, so titles or
            # links containing an apostrophe no longer break the statement.
            conn.execute(
                "insert into dytt_movies values(?,?,?)",
                (movie_time, movie_name, movie_url),
            )
            conn.commit()
        finally:
            # Release the file handle even when the insert raises.
            conn.close()
    
    def down_link_save(movie_time, movie_name, movie_url):
        """Fetch a movie's detail page, extract its download link and store it.

        The link is the ``href`` of the first ``<a>`` inside the
        ``td[style="WORD-WRAP: break-word"]`` cell of ``div.co_content8``.  It
        replaces the detail-page URL in the row handed to ``save_db``.

        Any failure (network error, timeout, unexpected page layout, database
        error) is reported on stdout and swallowed, so one bad movie does not
        affect the others.

        Args:
            movie_time: Timestamp text for the movie.
            movie_name: Movie title; also used in the failure message.
            movie_url: Address of the movie's detail page.
        """
        try:
            # A timeout keeps a stalled server from blocking this worker (and
            # the caller that joins it) forever.
            down_link_res = requests.get(movie_url, timeout=30)
            down_link_res.encoding = 'gbk'
            down_link_soup = BeautifulSoup(down_link_res.text, 'html.parser')
            down_link = (
                down_link_soup.find('div', class_='co_content8')
                .find('td', style="WORD-WRAP: break-word")
                .find('a')['href']
            )
            save_db(movie_time, movie_name, down_link)
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so that
            # SystemExit / KeyboardInterrupt are not swallowed.
            print("{}获取链接失败".format(movie_name))
    
    def show_results(dbname):
        """Write every stored movie to a dated text file and open that file.

        Rows are read from the ``dytt_movies`` table ordered by ``movie_time``
        descending, the total is printed, and one fixed-width line per movie
        (running number, time, name, link) is written to
        ``电影天堂_<yy-mm-dd>.txt`` in the current directory.  The file is
        rewritten from scratch on every call.

        Args:
            dbname: Path of the SQLite database to read.
        """
        filename = '电影天堂_' + strftime('%y-%m-%d', time.localtime()) + '.txt'
        conn = sqlite3.connect(dbname)
        try:
            results = conn.execute(
                "select * from dytt_movies order by movie_time desc "
            ).fetchall()
        finally:
            conn.close()
        print("总共找到{}部电影!".format(len(results)))
        # Open once in write mode: this truncates any previous report, creates
        # the file even when there are no rows, and avoids reopening per row.
        with open(filename, 'w', encoding='utf-8') as f:
            for i, movie in enumerate(results, start=1):
                f.write("{:<5s}{:<30s}{:<40s}{:<40s}\n".format(
                    str(i), movie[0], movie[1], movie[2]))

        # Hands the file name to the OS shell; presumably meant to open the
        # report with the associated program on Windows -- confirm for other
        # platforms.
        system(filename)
    
    # Script entry point: recreate today's database, crawl listing pages 1-20,
    # resolve every movie's download link in parallel, then write the report.
    if __name__ == '__main__':
        start = datetime.now()
        dbname = '电影天堂_' + strftime('%y-%m-%d', time.localtime()) + '.sqlite'
        # Start from an empty database on every run.
        if exists(dbname):
            remove(dbname)
        conn = sqlite3.connect(dbname)
        try:
            conn.execute("create table dytt_movies(movie_time varchar(40),movie_name varchar(40),movie_url varchar(60))")
            conn.commit()
        finally:
            conn.close()
        for i in range(1, 21):
            url = 'https://www.dytt8.net/html/gndy/china/list_4_{}.html'.format(i)
            print('collecting message from {:s}'.format(url))
            # getmsg() downloads the page itself, so no separate request is
            # made here (the page used to be fetched twice).
            try:
                movies = getmsg(url)
            except Exception as exc:
                # One unreachable or malformed listing page must not abort the
                # remaining pages.
                print('failed to collect {:s}: {}'.format(url, exc))
                continue
            # One thread per movie: a failure while resolving one download link
            # does not stop the other entries of the page from being processed.
            threads = [
                Thread(target=down_link_save, args=(item[0], item[1], item[2]))
                for item in movies
            ]
            for t in threads:
                t.start()
            # Wait for the whole page to finish before moving to the next one.
            for t in threads:
                t.join()
        run_time = (datetime.now() - start).total_seconds()
        # ``end`` belongs to print(), not to str.format(), where it was ignored.
        print("共用时{}秒".format(run_time), end='\t')
        show_results(dbname)
    
    
  • 相关阅读:
    react native错误排查-TypeError: window.deltaUrlToBlobUrl is not a function
    react native报错处理com.android.build.api.transform.TransformException: com.android.builder.dexing.DexArchiveBuilderException: com.android.builder.dexing.DexArchiveBuilderException: Failed to process
    react native中一次错误排查 Error:Error: Duplicate resources
    umijs开发实践-不同页面交叉使用dva中的modal文件导致的错误
    每天五分钟-javascript数据类型
    react native中使用echarts
    微信小程序中通过腾讯地图进行逆地址解析报错message: "请求来源未被授权, 此次请求来源域名:servicewechat.com"
    在react中实现打印功能
    mac git从代码仓库克隆代码,修改并上传
    基于jwt的用户登录认证
  • 原文地址:https://www.cnblogs.com/yuexiao/p/12786856.html
Copyright © 2011-2022 走看看