zoukankan      html  css  js  c++  java
  • requests+BeautifulSoup | 爬取电影天堂全站电影资源

    import requests
    import urllib.request as ur
    from bs4 import BeautifulSoup
    import csv
    import threading
    class MovieHeven():
        """Crawl the dytt8.net movie listing and save the results as CSV.

        Starting from the first index page, every listing page is fetched,
        each movie's title and detail-page URL is appended to ``movies.csv``,
        and the "下一页" (next page) link is followed until none is left.

        Attributes:
            url: URL of the listing page that will be fetched next.
            page: 1-based number of the page being crawled (progress output).
            No: 1-based running number of the next movie to be written.
            fobj: Text handle of the CSV output file; opened in ``__init__``
                and closed by ``main``.
        """

        # Seconds to wait for the server before giving up on a request.
        # Without an explicit timeout a stalled connection blocks the crawl.
        REQUEST_TIMEOUT = 30

        def __init__(self):
            """Set the start URL and counters and open ``movies.csv``."""
            self.url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
            self.page = 1
            self.No = 1
            # errors="replace": scraped titles may contain characters GBK
            # cannot encode (for example a no-break space, U+00A0).  With the
            # default strict handling one such character raised
            # UnicodeEncodeError and ended the crawl; now it is written as "?".
            self.fobj = open("movies.csv", "wt", encoding="gbk",
                             errors="replace", newline='')

        def spider(self):
            """Crawl listing pages from ``self.url`` until no next page exists.

            For each page, one ``[name, href]`` CSV row is written per movie
            and echoed to stdout, then ``self.url`` and ``self.page`` are
            advanced to the next page.  Pages are walked in a loop instead of
            by recursion, so stack depth does not grow with the page count
            and a page that repeats the next-page link is not crawled twice.

            Any exception (network failure, unexpected page layout, ...) is
            printed and ends the crawl; rows written so far are kept.
            """
            # One writer for the whole crawl instead of one per row.
            writer = csv.writer(self.fobj)
            try:
                while True:
                    print("正在爬取第{}页...".format(self.page))
                    # Fetch the page; the response is decoded as GBK.
                    response = requests.get(self.url,
                                            timeout=self.REQUEST_TIMEOUT)
                    response.raise_for_status()
                    response.encoding = "gbk"
                    root = BeautifulSoup(response.text, "lxml")

                    # The movie list and the pager live in the same container.
                    content = root.find("div", attrs={"class": "co_content8"})
                    for table in content.find("ul").find_all("table"):
                        link = table.find("a")
                        if link is None or not link.get("href"):
                            # A table without a usable link is skipped rather
                            # than aborting the whole crawl.
                            continue
                        name = link.text
                        href = "http://www.dytt8.net" + link["href"]
                        writer.writerow([name, href])
                        print("No:", self.No, name, href)
                        self.No += 1

                    # Look for the next-page link in the pager.
                    pager = content.find("div", attrs={"class": "x"})
                    next_href = None
                    for anchor in pager.find_all("a"):
                        if anchor.text == "下一页":
                            next_href = anchor["href"]
                            break
                    if next_href is None:
                        # Last page reached.
                        break

                    self.url = ("https://www.dytt8.net/html/gndy/dyzz/"
                                + next_href)
                    print(self.url)
                    self.page += 1
            except Exception as err:
                # Best effort: report the problem and stop crawling.
                print(err)

        def main(self):
            """Run the crawl, close the CSV file and print the elapsed time."""
            import time  # local import; only this method needs the module

            begin_time = time.time()
            try:
                self.spider()
            finally:
                # Close (and flush) the CSV even if the crawl is interrupted.
                self.fobj.close()
            elapsed = time.time() - begin_time
            m, s = divmod(round(elapsed), 60)
            print("用时:{}min{}s".format(m, s))
    
    if __name__ == "__main__":
        # Script entry point: build the crawler and run the full crawl.
        crawler = MovieHeven()
        crawler.main()
    

      

  • 相关阅读:
    [NHibernate]条件查询Criteria Query
    [JQuery]用InsertAfter实现图片走马灯展示效果
    [NHibernate]HQL查询
    [NHibernate]基本配置与测试
    [HTML/CSS]margin属性用法
    [HTML/CSS]盒子模型,块级元素和行内元素
    [Asp.net MVC]Asp.net MVC5系列——布局视图
    [c#基础]值类型和引用类型的Equals,==的区别
    用中间件实现读负载均衡的数据库群集
    论数据库连接池对中间件性能的重要性
  • 原文地址:https://www.cnblogs.com/billie52707/p/12113520.html
Copyright © 2011-2022 走看看