zoukankan      html  css  js  c++  java
  • requests+BeautifulSoup | 爬取电影天堂全站电影资源

    import requests
    import urllib.request as ur
    from bs4 import BeautifulSoup
    import csv
    import threading
    class MovieHeven():
        """Crawl the dytt8.net movie listing and save each title/link pair to CSV.

        Attributes:
            url: Address of the listing page that will be fetched next.
            page: 1-based number of the listing page being crawled.
            No: 1-based running number of the next movie row.
            fobj: Open text handle of ``movies.csv`` that rows are written to.
        """

        def __init__(self):
            self.url = "https://www.dytt8.net/html/gndy/dyzz/index.html"
            self.page = 1
            self.No = 1
            # errors="replace" keeps a single character that GBK cannot encode
            # (for example U+FFFD produced while decoding the page) from
            # aborting the whole crawl with UnicodeEncodeError.
            self.fobj = open("movies.csv", "wt", encoding="gbk",
                             errors="replace", newline='')

        def spider(self):
            """Crawl listing pages from ``self.url`` until no next page exists.

            For every movie table on a page the link text and the absolute
            link are written as one CSV row and echoed to stdout. ``self.url``,
            ``self.page`` and ``self.No`` are advanced as the crawl proceeds.

            Pages are followed in a loop rather than by recursion so that a
            long listing cannot exhaust the interpreter's recursion limit.
            Any error is printed and ends the crawl; nothing is raised to the
            caller (best-effort behaviour).
            """
            try:
                # One writer serves the whole crawl.
                writer = csv.writer(self.fobj)
                while True:
                    print("正在爬取第{}页...".format(self.page))
                    # The timeout prevents a stalled connection from hanging
                    # the crawl forever.
                    response = requests.get(self.url, timeout=30)
                    # Fail with a clear HTTP error instead of parsing an
                    # error page as if it were a listing.
                    response.raise_for_status()
                    response.encoding = "gbk"
                    root = BeautifulSoup(response.text, "lxml")

                    # Container holding both the movie tables and the pager.
                    content = root.find("div", attrs={"class": "co_content8"})
                    for table in content.find("ul").find_all("table"):
                        link = table.find("a")
                        if link is None or not link.get("href"):
                            # A table without a usable link is skipped rather
                            # than ending the crawl.
                            continue
                        name = link.text
                        href = "http://www.dytt8.net" + link["href"]
                        writer.writerow([name, href])
                        print("No:", self.No, name, href)
                        self.No += 1

                    # Look for the "next page" link among the pager links.
                    next_href = None
                    pager = content.find("div", attrs={"class": "x"})
                    for anchor in pager.find_all("a"):
                        if anchor.text == "下一页":
                            next_href = anchor["href"]
                            break
                    if next_href is None:
                        # Last page reached.
                        break

                    self.url = "https://www.dytt8.net/html/gndy/dyzz/" + next_href
                    print(self.url)
                    self.page += 1
            except Exception as err:
                # Top-level boundary: report the problem and stop crawling.
                print(err)

        def main(self):
            """Run the crawl, close the output file and print the elapsed time."""
            import time
            begin_time = time.time()
            try:
                self.spider()
            finally:
                # Close (and thereby flush) the CSV even when the crawl is
                # interrupted, e.g. by Ctrl-C.
                self.fobj.close()
            end_time = time.time()
            # Named `elapsed` so the `time` module is not shadowed.
            elapsed = end_time - begin_time
            m, s = divmod(round(elapsed), 60)
            print("用时:{}min{}s".format(m, s))
    
    # Script entry point: build the crawler and run it when executed directly.
    if __name__ == '__main__':
        crawler = MovieHeven()
        crawler.main()
    

      

  • 相关阅读:
    庄家试盘的K线形态
    股票基本知识入门提纲
    我与猫
    夜雨不眠时
    快速排序
    由float转std::string的方法
    BugFree + EasyPHP在Windows平台搭建步骤详解
    安装VS2008的时候Windows Mobile 5.0 SDK R2 for pocket pc错误解决方案
    收集WCF文章
    linq to ef(相当于sql中in的用法)查询语句
  • 原文地址:https://www.cnblogs.com/billie52707/p/12113520.html
Copyright © 2011-2022 走看看