  • A Python-based spider (crawling a fanhao site)

    I slacked off for a few days, but recently there is a new result: a spider that crawls a fanhao site. Please ignore the subject matter, this is just a learning exercise ☺️

    import requests

    from urllib import parse
    from scrapy import Selector
    from MyProjects.test_model import *


    domin = "https://www.9fh.org"


    def get_nodes_list():
        # Collect the detail-page URL of every performer listed on the first index page.
        pages = requests.get("https://www.9fh.org/special-show-p-1.html").text
        sel = Selector(text=pages)
        p1_girls_url = sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract()
        url_list = []
        for te in p1_girls_url:
            url_list.append(parse.urljoin(domin, te))
        return url_list


    def get_all_url(url):
        # url is a performer's homepage; return the absolute URLs of all works listed there.
        pages = requests.get(url).text
        sel = Selector(text=pages)
        urls = sel.xpath("//table[@class='table table-striped']/tbody/tr/td[2]/a/@href").extract()
        all_url = []
        for tp in urls:
            all_url.append(parse.urljoin(domin, tp))
        return all_url


    def demo():
        # For every performer on the first index page, gather the URLs of their works.
        all_urls = []
        url_list = get_nodes_list()
        for url in url_list:
            all_urls.append(get_all_url(url))
        return all_urls


    def get_info(last_urls):
        # Visit each work page, extract the fields and save one database row per work.
        for single_url in last_urls:
            for i in single_url:
                pages = requests.get(i).text
                sel = Selector(text=pages)
                # requests guesses the encoding as ISO-8859-1, so the text is re-encoded
                # and decoded as UTF-8 to recover the Chinese strings.
                name = sel.xpath("//div[@class='row']/div[1]/h2[1]/a/text()").extract()[0].encode('ISO-8859-1').decode('utf8')
                fanhao = sel.xpath("//div[@class='info']/p[1]/span[2]/text()").extract()[0].encode('ISO-8859-1').decode('utf8')
                launch_time = sel.xpath("//div[@class='info']/p[2]/text()").extract()[0]
                varieties = sel.xpath("//div[@class='info']/p[6]/span/a/text()").extract()
                types = ','.join(varieties).encode('ISO-8859-1').decode('utf8')
                work_time = sel.xpath("//div[@class='info']/p[3]/text()").extract()
                wk = ''.join(work_time).encode('ISO-8859-1').decode('utf8')
                act = sel.xpath("//div[@class='row placeholders']/div/h4/a/text()").extract()
                actor = ','.join(act).encode('ISO-8859-1').decode('utf8')

                topic = Topic()
                topic.main_actor = actor
                topic.fanhao = fanhao
                topic.varieties = types
                topic.launch_time = launch_time
                topic.work_time = wk
                topic.work_name = name

                topic.save(force_insert=True)


    if __name__ == "__main__":
        last_urls = demo()
        get_info(last_urls)
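
    The script above only crawls the first listing page; the commented-out branches in the original hinted at following the "next page" link as well. Below is a minimal sketch of that idea; the function name is mine, the pagination XPath is carried over from the dead code, and the whole thing is an untested assumption rather than part of the original spider.

    def get_all_listing_pages(start_url="https://www.9fh.org/special-show-p-1.html"):
        # Follow the "next page" link until it disappears; the `seen` set guards
        # against a self-referencing link looping forever.
        url_list, seen = [], set()
        page_url = start_url
        while page_url and page_url not in seen:
            seen.add(page_url)
            sel = Selector(text=requests.get(page_url).text)
            for href in sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract():
                url_list.append(parse.urljoin(domin, href))
            next_page = sel.xpath("//ul[@class='pagination']/a[7]/@href").extract()
            page_url = parse.urljoin(domin, next_page[0]) if next_page else None
        return url_list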

    Database design:

    You can write the table DDL by hand, but it is also fine to use an ORM and have Python create the table automatically.

    Code:

    from peewee import *

    # Connection to the local MySQL database used by the spider.
    db = MySQLDatabase("spider", host="127.0.0.1", port=3306, user="root", password="123456")


    class BaseModel(Model):
        class Meta:
            database = db


    class Topic(BaseModel):
        # One row per work: main actor(s), fanhao, release date, runtime, title and genres.
        main_actor = TextField()
        fanhao = CharField()
        launch_time = DateField()
        work_time = CharField()
        work_name = TextField(default="")
        varieties = TextField(default="")


    if __name__ == "__main__":
        db.create_tables([Topic])

    What ends up in the database is the fanhao, the genres, the main actors and the release date of each work.

    I won't go into more detail here.
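
    If you want to read the rows back out, the same peewee model can be queried directly. A small sketch, assuming the model lives in MyProjects.test_model as in the spider above:

    from MyProjects.test_model import Topic

    # Print every stored work, newest release first.
    for topic in Topic.select().order_by(Topic.launch_time.desc()):
        print(topic.fanhao, topic.work_name, topic.main_actor, topic.launch_time)

    # Count works whose genre list contains a keyword (the keyword is only an example).
    print(Topic.select().where(Topic.varieties.contains("HD")).count())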

     
     