1、爬取cnblogs首页文章,打印出标题和连接地址
spiders/cnblogs.py
import scrapy


class CnblogsSpider(scrapy.Spider):
    """Crawl the cnblogs.com front page and print each article's
    title and link address."""

    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        # Every front-page article is an element carrying the
        # "post-item" CSS class.  XPath equivalent kept for reference:
        #   response.xpath('//article[contains(@class,"post-item")]')
        for post in response.css('.post-item'):
            # Both the headline text and its href hang off .post-item-title.
            caption = post.css('.post-item-title::text').extract_first()
            link = post.css('.post-item-title::attr(href)').extract_first()
            print("""
            文章标题:%s
            文章链接:%s
            """ % (caption, link))
2、爬取cnblogs文章,把标题连接地址和文章内容保存到mysql,连续爬取n页
持续爬取下一页原理:
# 我们每爬一页就用css选择器来查询,是否存在下一页链接, # 存在:则拼接出下一页链接,继续爬下一页链接,然后把下一页链接提交给当前爬取的函数parse,继续爬取,继续查找下一页,直到找不到下一页,说明所有页面已经爬完,那结束爬虫
spiders/cnblogs.py
import scrapy

from firstscrapy.items import CnblogsMysqlItem


class CnblogsSpider(scrapy.Spider):
    """Crawl cnblogs articles page by page, yielding one
    CnblogsMysqlItem (title / link / summary) per article for the
    pipelines to persist, and following the "next page" link until
    no more pages remain."""

    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        # One .post-item element per article on the index page.
        article_list = response.css('.post-item')
        for article in article_list:
            item = CnblogsMysqlItem()
            title = article.css('.post-item-title::text').extract_first()
            title_url = article.css('.post-item-title::attr(href)').extract_first()
            summary = article.css('.post-item-summary::text').extract_first()
            # scrapy.Item only supports dict-style assignment
            # (item['title'] = ...); attribute style (item.title) raises.
            item['title'] = title
            item['title_url'] = title_url
            item['summary'] = summary
            yield item

        # Pagination: while more pages remain, the last <a> inside
        # div.pager is the ">" (next page) arrow.
        pager = response.xpath('//div[@class="pager"]')
        next_page = pager.xpath('.//a[last()]/text()').extract_first()
        # BUG FIX: the original used `next_page is ">"` — an identity
        # comparison that is not guaranteed True for equal but distinct
        # string objects.  Value equality (==) is the correct test.
        if next_page == ">":
            next_page_url = pager.xpath('.//a[last()]/@href').extract_first()
            # The href is site-relative; prefix the scheme+host.
            next_page_url_full = 'https://www.cnblogs.com%s' % next_page_url
            # Re-enter this same callback for the next page.
            yield scrapy.Request(next_page_url_full, callback=self.parse)
items.py
import scrapy


class CnblogsMysqlItem(scrapy.Item):
    """Container for one scraped cnblogs article, consumed by the
    file and MySQL pipelines."""

    # Article headline text.
    title = scrapy.Field()
    # URL of the article.
    title_url = scrapy.Field()
    # Short summary shown on the index page.
    summary = scrapy.Field()
pipelines.py
class CnblogsFilePipeline(object):
    """Pipeline that appends every scraped item to cnblogs.txt.

    open_spider/close_spider each run exactly once per crawl, so the
    file handle is opened and closed there; process_item runs once
    per item and only writes.
    """

    def open_spider(self, spider):
        # Runs once when the spider starts: acquire the output file.
        self.file = open('cnblogs.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Runs once per item: persist the three fields, space-separated.
        self.file.write(item['title'] + ' ')
        self.file.write(item['title_url'] + ' ')
        self.file.write(item['summary'] + ' ')
        # Return the item so lower-priority pipelines also receive it.
        return item

    def close_spider(self, spider):
        # Runs once when the spider finishes: release the file handle.
        self.file.close()


class CnblogsMysqlPipeline(object):
    """Pipeline that inserts every scraped item into the MySQL table
    c_cnblogs (database 'cnblogs')."""

    def open_spider(self, spider):
        # Imported here (not at module top) so this module can still be
        # loaded when pymysql is absent, e.g. with only the file
        # pipeline enabled in settings.
        import pymysql
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password="123456",
                                    database='cnblogs', port=3306)

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        sql = 'insert into c_cnblogs (title,title_url,summary)values(%s,%s,%s) '
        # BUG FIX: the original created a cursor per item and never
        # closed it; the context manager closes it even if execute()
        # raises.  Parameterized query keeps the input SQL-safe.
        with self.conn.cursor() as cursor:
            cursor.execute(sql, [item['title'], item['title_url'],
                                 item['summary']])
        self.conn.commit()
        return item
settings.py
# 开启管道 ITEM_PIPELINES = { 'firstscrapy.pipelines.CnblogsFilePipeline': 300, 'firstscrapy.pipelines.CnblogsMysqlPipeline': 305, # 'firstscrapy.pipelines.ChoutiFilePipeline': 300, #300表示为优先级,值越小优先级越高 # 'firstscrapy.pipelines.ChoutiMysqlPipeline': 305, }
from selenium import webdriver
import time
import requests

# --- Step 1: log in through a real browser so the site issues valid cookies.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
login_b = bro.find_element_by_id('login_btn')
print(login_b)
login_b.click()
username = bro.find_element_by_name('phone')
username.send_keys('18953675221')
password = bro.find_element_by_name('password')
password.send_keys('lqz123')
button = bro.find_element_by_css_selector('button.login-btn')
button.click()
# A captcha may appear — leave time to solve it by hand.
time.sleep(10)
my_cookie = bro.get_cookies()  # list of {'name': ..., 'value': ...} dicts
print(my_cookie)
bro.close()

# --- Step 2: selenium cookies are a list of dicts, not the flat
# name->value mapping requests expects — convert once here.
cookie = {item['name']: item['value'] for item in my_cookie}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Referer': 'https://dig.chouti.com/'}

# --- Step 3: fetch the 24-hour hot list and collect the article ids.
ret = requests.get('https://dig.chouti.com/top/24hr?_=1596677637670',
                   headers=headers)
print(ret.json())
ll = [entry['id'] for entry in ret.json()['data']]
print(ll)

# --- Step 4: up-vote each article with the logged-in cookies.
for link_id in ll:
    # BUG FIX: the original URL began with a space
    # (' https://dig.chouti.com/link/vote'), which requests rejects
    # as an invalid URL.
    ret = requests.post('https://dig.chouti.com/link/vote',
                        headers=headers, cookies=cookie,
                        data={'linkId': link_id})
    print(ret.text)

# Comment-posting endpoint and its payload shape, kept for reference:
'https://dig.chouti.com/comments/create'
'''
content: 说的号
linkId: 29829529
parentId: 0
'''