今日学习2h
进行了爬虫实操。过程中的难点是:用 XPath 定位元素时,目标内容在网页源代码中可以找到,但在浏览器解析(渲染)后的代码里却找不到对应的位置。

import re import requests from fake_useragent import UserAgent url='https://www.qiushibaike.com/text/' headers={ "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62" } resp=requests.get(url,headers=headers) # print(resp.text) contents=re.findall(r'<div class="content">\s*<span>\s*(.+)\s*</span>',resp.text) with open("duanzi.txt",'a',encoding='utf-8') as f: for info in contents: f.write(info+"\n\n")

from fake_useragent import UserAgent
import requests
from lxml import etree
from time import sleep


def get_html(url):
    '''
    Download *url* and return its decoded HTML.

    :param url: address to fetch
    :return: page text on HTTP 200, otherwise None
    '''
    # BUG FIX: the header key must be "User-Agent" (with the dash).
    # "UserAgent" is not a real header, so requests' default UA leaked through.
    headers = {
        "User-Agent": UserAgent().chrome
    }
    resp = requests.get(url, headers=headers)
    sleep(2)  # crude rate limit so we don't hammer the site
    if resp.status_code == 200:
        resp.encoding = 'utf-8'
        return resp.text
    return None


def parse_list(html):
    '''
    :param html: listing page containing movie entries
    :return: list of absolute movie-detail URLs
    '''
    e = etree.HTML(html)
    return ['http://maoyan.com{}'.format(href)
            for href in e.xpath('//div[@class="movie-item film-channel"]/a/@href')]


def parse_index(html):
    '''
    :param html: a movie-detail page
    :return: dict with the movie's name, type and de-duplicated actor names
    '''
    e = etree.HTML(html)
    name = e.xpath('//h1[@class="name"]/text()')
    # Renamed from `type` to avoid shadowing the builtin; dict key unchanged.
    movie_type = e.xpath('//li[@class="ellipsis"]/a/text()')
    actors = e.xpath('//ul[@class="celebrity-list clearfix"]/li/div/a/text()')
    # BUG FIX: the original called the *builtin* format(), which merely
    # stringified the list; format_data() was clearly the intended call.
    actors = format_data(actors)
    return {"name": name, "type": movie_type, "actors": actors}


def format_data(actors):
    '''Strip whitespace from each actor name and drop duplicates.

    :param actors: iterable of raw actor-name strings
    :return: set of cleaned names
    '''
    return {actor.strip() for actor in actors}


def main():
    num = int(input("请输入要获取多少页"))
    for page in range(num):
        url = 'http://maoyan.com/films?showType=3&offset={}'.format(page * 30)
        print(url)
        list_html = get_html(url)
        if list_html is None:  # download failed — skip this listing page
            continue
        for movie_url in parse_list(list_html):
            print(movie_url)
            info_html = get_html(movie_url)
            if info_html is None:
                continue
            print(parse_index(info_html))


if __name__ == '__main__':
    main()

from fake_useragent import UserAgent
import requests
from lxml import etree


# 发送请求
class Downloader:
    """Fetches a page and returns its HTML text."""

    def do_download(self, url):
        print(url)
        headers = {
            'User-Agent': UserAgent().chrome
        }
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            resp.encoding = 'utf-8'
            return resp.text
        return None  # explicit: callers must handle failed downloads


# 数据解析
class Parser:
    """Extracts joke texts and pagination links from a listing page."""

    def do_parse(self, html):
        e = etree.HTML(html)
        contents = [div.xpath('string(.)').strip()
                    for div in e.xpath('//div[@class="content"]')]
        urls = ['https://www.qiushibaike.com{}'.format(url)
                for url in e.xpath('//ul[@class="pagination"]/li/a/@href')]
        return contents, urls


# 数据保存
class DataOutPut:
    """Appends scraped texts to duanzi3.txt."""

    def do_save(self, datas):
        with open('duanzi3.txt', 'a', encoding='utf-8') as f:
            for data in datas:
                f.write(data + '\n')


# URL管理器
class URLManager:
    """Tracks which URLs are pending and which have already been crawled."""

    def __init__(self):
        self.new_url = set()  # pending URLs
        self.old_url = set()  # already-crawled URLs

    # 加入一个url
    def add_new_url(self, url):
        # Ignore empty values and anything already crawled.
        if url is not None and url != '' and url not in self.old_url:
            self.new_url.add(url)

    # 加入多个url
    def add_new_urls(self, urls):
        for url in urls:
            self.add_new_url(url)

    # 获取一个url
    def get_new_url(self):
        url = self.new_url.pop()
        self.old_url.add(url)
        return url

    # 获取还有多少个url要爬取
    def get_new_url_size(self):
        return len(self.new_url)

    # 获取是否还有url要爬取
    def is_have_new_url(self):
        return self.get_new_url_size() > 0


# 调度器
class Scheduler:
    """Wires downloader, parser, saver and URL manager into one crawl loop."""

    def __init__(self):
        self.downloader = Downloader()
        self.parser = Parser()
        self.data_out_put = DataOutPut()
        self.url_manager = URLManager()

    def start(self, url):
        self.url_manager.add_new_url(url)
        while self.url_manager.is_have_new_url():
            url = self.url_manager.get_new_url()
            html = self.downloader.do_download(url)
            # BUG FIX: a non-200 response made do_download return None and the
            # parser then crashed on etree.HTML(None) — skip failed pages.
            if html is None:
                continue
            datas, urls = self.parser.do_parse(html)
            self.data_out_put.do_save(datas)
            self.url_manager.add_new_urls(urls)


if __name__ == '__main__':
    scheduler = Scheduler()
    url = 'https://www.qiushibaike.com/text'
    scheduler.start(url)

from threading import Thread
import requests
from lxml import etree
from fake_useragent import UserAgent
from queue import Queue, Empty


class Spider(Thread):
    """Worker thread: drains listing URLs from a shared queue and appends the
    scraped joke texts to duanzi2.txt."""

    def __init__(self, url_queue):
        Thread.__init__(self)
        self.url_queue = url_queue

    def run(self):
        while True:
            # BUG FIX: the original empty()+get() pair races between threads —
            # another worker can drain the queue between the two calls, leaving
            # get() blocked forever.  get_nowait()/Empty is the safe pattern.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            print(url)
            # BUG FIX: the header key must be 'User-Agent' (with the dash);
            # 'UserAgent' is ignored, so requests' default UA was sent instead.
            headers = {
                'User-Agent': UserAgent().chrome
            }
            resp = requests.get(url, headers=headers)
            e = etree.HTML(resp.text)
            contents = [div.xpath('string(.)').strip()
                        for div in e.xpath('//div[@class="content"]')]
            # print(contents)
            with open('duanzi2.txt', 'a', encoding='utf-8') as f:
                for content in contents:
                    f.write(content + '\n')


if __name__ == '__main__':
    base_url = 'https://www.qiushibaike.com/text/page/{}/'
    url_queue = Queue()
    for num in range(1, 6):
        url_queue.put(base_url.format(num))
    for num in range(3):
        spider = Spider(url_queue)
        spider.start()