1. Capturing mobile data with Fiddler
Configure Fiddler: Tools -> Options -> Connections -> check "Allow remote computers to connect", Fiddler port: xxxx
Install Fiddler's certificate on the mobile device: make sure the phone and the PC running Fiddler are on the same network segment; in the phone's browser visit <IP of the Fiddler machine>:<Fiddler port>; once the certificate has downloaded, install it and trust it
Configure the phone's network: set the phone's proxy to that ip:port
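As a quick sanity check that traffic really flows through Fiddler, a request can be sent through the proxy from a script (a minimal sketch; 192.168.1.100:8888 is a placeholder for the Fiddler machine's actual IP and port):

import requests

# placeholder address: the Fiddler machine's IP and the port configured above
proxies = {
    'http': 'http://192.168.1.100:8888',
    'https': 'http://192.168.1.100:8888',
}

# if the proxy is set up correctly, this request also shows up in Fiddler's session list
resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
print(resp.status_code, resp.text)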
2. Basic use of Scrapy
settings
ROBOTSTXT_OBEY = False
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
ITEM_PIPELINES = {
    'firstblood.pipelines.FirstbloodPipeline': 300,
}
# -*- coding: utf-8 -*-
import scrapy


class FirstSpider(scrapy.Spider):
    # name of the spider
    name = 'first'
    # allowed domains
    # allowed_domains = ['www.xxx.com']
    # list of start urls
    start_urls = ['https://www.qiushibaike.com/text/']

    # Parsing + persistent storage. Two options:
    #   1. Terminal-command-based persistence: scrapy crawl first -o xxx.csv
    #      (only the return value of parse() can be persisted to a local file)
    #   2. Pipeline-based persistence
    def parse(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        all_data = []
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # if the xpath is guaranteed to return a single-element list, .extract_first() can be used
            author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a[1]/div/span//text()').extract()
            content = ''.join(content)
            dic = {
                'author': author,
                'content': content
            }
            all_data.append(dic)
        # returning the parsed data is what makes terminal-command-based persistence work
        return all_data
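The same export can also be driven from a small script instead of the terminal command (a minimal sketch, assuming the script sits in the project root next to scrapy.cfg and that the spider module path is firstblood/spiders/first.py; FEED_FORMAT/FEED_URI are the classic feed-export settings):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# assumed module path inside the firstblood project
from firstblood.spiders.first import FirstSpider

settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')   # same effect as the -o file extension
settings.set('FEED_URI', 'xxx.csv')  # output file, as in `scrapy crawl first -o xxx.csv`

process = CrawlerProcess(settings)
process.crawl(FirstSpider)
process.start()  # blocks until the crawl finishes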
Parsing data + pipeline-based persistent storage
settings
# the lower the number, the earlier the pipeline runs
ITEM_PIPELINES = {
    'boosPro.pipelines.BoosproPipeline': 300,
    'boosPro.pipelines.MysqlPipeline': 301,
    'boosPro.pipelines.RedisPipeline': 302,
}
# -*- coding: utf-8 -*-
import scrapy
from boosPro.items import BoosproItem


class BossSpider(scrapy.Spider):
    name = 'boss'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    # template for the paginated url
    url = 'https://www.zhipin.com/c101010100/?query=python爬虫&page=%d&ka=page-2'
    page = 1

    # parsing + pipeline-based persistent storage
    def parse(self, response):
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            job_name = li.xpath('.//div[@class="info-primary"]/h3/a/div/text()').extract_first()
            salary = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            company = li.xpath('.//div[@class="company-text"]/h3/a/text()').extract_first()

            # instantiate an item object
            item = BoosproItem()
            # pack all of the parsed data into the item object
            item["job_name"] = job_name
            item["salary"] = salary
            item["company"] = company

            # submit the item to the pipelines
            yield item

        # crawl the first few result pages
        if self.page <= 3:
            print("executing!!!")
            self.page += 1
            new_url = self.url % self.page
            print(new_url)
            # issue the next request manually
            yield scrapy.Request(url=new_url, callback=self.parse)
items
import scrapy


class BoosproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
pipelines
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

import pymysql
from redis import Redis


class BoosproPipeline(object):
    fp = None

    # called once when the spider starts
    def open_spider(self, spider):
        print("spider started......")
        self.fp = open('./boss.txt', 'w', encoding='utf-8')

    # called once when the spider closes
    def close_spider(self, spider):
        print("spider finished.......")
        self.fp.close()

    # called once for every item the spider submits to the pipeline
    # item: the item object received by the pipeline
    def process_item(self, item, spider):
        self.fp.write(item["job_name"] + ":" + item["salary"] + ":" + item["company"] + "\n")
        # return the item so the next pipeline also receives it
        return item


class MysqlPipeline(object):
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='',
                                    db='scrapy', charset='utf8')
        print(self.conn)

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # parameterized query avoids quoting problems in the scraped strings
            self.cursor.execute('insert into boss values (%s, %s, %s)',
                                (item["job_name"], item["salary"], item["company"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # close the cursor before the connection
        self.cursor.close()
        self.conn.close()


class RedisPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)
        print(self.conn)

    def process_item(self, item, spider):
        dic = {
            'name': item["job_name"],
            'salary': item["salary"],
            'company': item["company"]
        }
        # newer versions of redis-py only accept bytes/str/numbers, so serialize the dict to JSON
        self.conn.lpush('boss', json.dumps(dic))
        return item
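The three pipelines run in the priority order declared in ITEM_PIPELINES (lower number first), and each process_item has to return the item so the next pipeline receives it. To illustrate that chain, a small validation pipeline could be slotted in ahead of the storage pipelines; ValidatePipeline below is hypothetical, not part of the project above, and would need its own entry in ITEM_PIPELINES with a priority below 300, e.g. 'boosPro.pipelines.ValidatePipeline': 299.

# hypothetical example: drop incomplete items before they reach the file/MySQL/Redis pipelines
from scrapy.exceptions import DropItem


class ValidatePipeline(object):
    def process_item(self, item, spider):
        # DropItem stops the pipeline chain for this item only
        if not all([item.get("job_name"), item.get("salary"), item.get("company")]):
            raise DropItem("incomplete item: %s" % dict(item))
        # returning the item passes it on to the next pipeline in priority order
        return item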