I went quiet for a few days, but I'm back with a new bit of research: scraping a fanhao (JAV code) index site. Please ignore the subject matter, this is purely for learning ☺️
```python
import requests

from urllib import parse
from scrapy import Selector
from MyProjects.test_model import Topic

domain = "https://www.9fh.org"


def get_nodes_list():
    # Grab every girl's profile URL from the first index page.
    # (Only page 1 is fetched here; see the pagination sketch below.)
    pages = requests.get("https://www.9fh.org/special-show-p-1.html").text
    sel = Selector(text=pages)
    p1_girls_url = sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract()
    return [parse.urljoin(domain, href) for href in p1_girls_url]


def get_all_url(url):
    # url is a girl's profile page; collect the links to all of her works.
    pages = requests.get(url).text
    sel = Selector(text=pages)
    hrefs = sel.xpath("//table[@class='table table-striped']/tbody/tr/td[2]/a/@href").extract()
    return [parse.urljoin(domain, href) for href in hrefs]


def demo():
    # For each girl on page 1, gather the URL list of her works.
    all_urls = []
    for url in get_nodes_list():
        all_urls.append(get_all_url(url))
    return all_urls


def get_info(last_urls):
    # Visit every work page, pull out its metadata, and save a row to MySQL.
    for single_url in last_urls:
        for i in single_url:
            resp = requests.get(i)
            # The pages are UTF-8, but requests guesses ISO-8859-1 from the
            # headers; set the encoding explicitly so .text decodes correctly.
            resp.encoding = "utf8"
            sel = Selector(text=resp.text)
            name = sel.xpath("//div[@class='row']/div[1]/h2[1]/a/text()").extract()[0]
            fanhao = sel.xpath("//div[@class='info']/p[1]/span[2]/text()").extract()[0]
            launch_time = sel.xpath("//div[@class='info']/p[2]/text()").extract()[0]
            varieties = sel.xpath("//div[@class='info']/p[6]/span/a/text()").extract()
            types = ','.join(varieties)
            work_time = sel.xpath("//div[@class='info']/p[3]/text()").extract()
            wk = ''.join(work_time)
            act = sel.xpath("//div[@class='row placeholders']/div/h4/a/text()").extract()
            actor = ','.join(act)

            topic = Topic()
            topic.main_actor = actor
            topic.fanhao = fanhao
            topic.varieties = types
            topic.launch_time = launch_time
            topic.work_time = wk
            topic.work_name = name

            topic.save(force_insert=True)


if __name__ == "__main__":
    last_urls = demo()
    get_info(last_urls)
```
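`get_nodes_list()` stops at the first index page. Below is a minimal sketch of walking the pagination links as well; the `//ul[@class='pagination']` XPath for the "next page" link is an assumption about the site's markup and may need adjusting:

```python
# Pagination sketch: keep following the "next page" link until none is left.
import requests
from urllib import parse
from scrapy import Selector

domain = "https://www.9fh.org"


def get_nodes_list_all_pages(start_url="https://www.9fh.org/special-show-p-1.html"):
    url_list = []
    seen_pages = set()  # guard against a "next" link that loops back
    next_url = start_url
    while next_url and next_url not in seen_pages:
        seen_pages.add(next_url)
        pages = requests.get(next_url).text
        sel = Selector(text=pages)
        for href in sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract():
            full = parse.urljoin(domain, href)
            if full not in url_list:  # avoid duplicates across pages
                url_list.append(full)
        # hypothetical "next page" link; an empty result ends the loop
        next_page = sel.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract()
        next_url = parse.urljoin(domain, next_page[0]) if next_page else None
    return url_list
```

Swapping this in for `get_nodes_list()` would make `demo()` cover the whole index instead of just page 1.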
Database design:
You can write the table-creation SQL by hand, or simply use an ORM and let the tables be generated automatically from Python.
Code:
```python
from peewee import *

# charset="utf8mb4" is assumed here so the Chinese titles and actor names
# round-trip correctly; adjust it to your MySQL setup.
db = MySQLDatabase("spider", host="127.0.0.1", port=3306,
                   user="root", password="123456", charset="utf8mb4")


class BaseModel(Model):
    class Meta:
        database = db


class Topic(BaseModel):
    main_actor = TextField()
    fanhao = CharField()
    launch_time = DateField()
    work_time = CharField()
    work_name = TextField(default="")
    varieties = TextField(default="")


if __name__ == "__main__":
    db.create_tables([Topic])
```
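Once `create_tables` has run and the spider has filled the table, the rows can be read back with ordinary peewee queries. A small sketch (the actor name is made up for illustration):

```python
from MyProjects.test_model import Topic

# All works featuring a given actor, newest release first.
rows = (Topic
        .select()
        .where(Topic.main_actor.contains("actress-name"))  # hypothetical name
        .order_by(Topic.launch_time.desc()))

for row in rows:
    print(row.fanhao, row.work_name, row.launch_time)
```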
In the end, what shows up in the database is the fanhao together with its genres, lead actors, and release date. I won't go into more detail here.