1、爬取cnblogs首页文章,打印出标题和连接地址
spiders/cnblogs.py
import scrapy


class CnblogsSpider(scrapy.Spider):
    """Crawl the cnblogs.com front page and print each article's
    title and link address."""

    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        # Every front-page article is an element carrying the
        # "post-item" CSS class.  XPath equivalent kept for reference:
        #   response.xpath('//article[contains(@class,"post-item")]')
        for post in response.css('.post-item'):
            # Both the headline text and its href hang off .post-item-title.
            caption = post.css('.post-item-title::text').extract_first()
            link = post.css('.post-item-title::attr(href)').extract_first()
            print("""
            文章标题:%s
            文章链接:%s
            """ % (caption, link))
2、爬取cnblogs文章,把标题连接地址和文章内容保存到mysql,连续爬取n页
持续爬取下一页原理:
# 我们每爬一页就用css选择器来查询,是否存在下一页链接, # 存在:则拼接出下一页链接,继续爬下一页链接,然后把下一页链接提交给当前爬取的函数parse,继续爬取,继续查找下一页,直到找不到下一页,说明所有页面已经爬完,那结束爬虫
spiders/cnblogs.py
import scrapy

from firstscrapy.items import CnblogsMysqlItem


class CnblogsSpider(scrapy.Spider):
    """Crawl cnblogs articles page by page, yielding one
    CnblogsMysqlItem (title / link / summary) per article for the
    pipelines to persist, and following the "next page" link until
    no more pages remain."""

    name = 'cnblogs'
    allowed_domains = ['www.cnblogs.com']
    start_urls = ['http://www.cnblogs.com/']

    def parse(self, response):
        # One .post-item element per article on the index page.
        article_list = response.css('.post-item')
        for article in article_list:
            item = CnblogsMysqlItem()
            title = article.css('.post-item-title::text').extract_first()
            title_url = article.css('.post-item-title::attr(href)').extract_first()
            summary = article.css('.post-item-summary::text').extract_first()
            # scrapy.Item only supports dict-style assignment
            # (item['title'] = ...); attribute style (item.title) raises.
            item['title'] = title
            item['title_url'] = title_url
            item['summary'] = summary
            yield item

        # Pagination: while more pages remain, the last <a> inside
        # div.pager is the ">" (next page) arrow.
        pager = response.xpath('//div[@class="pager"]')
        next_page = pager.xpath('.//a[last()]/text()').extract_first()
        # BUG FIX: the original used `next_page is ">"` — an identity
        # comparison that is not guaranteed True for equal but distinct
        # string objects.  Value equality (==) is the correct test.
        if next_page == ">":
            next_page_url = pager.xpath('.//a[last()]/@href').extract_first()
            # The href is site-relative; prefix the scheme+host.
            next_page_url_full = 'https://www.cnblogs.com%s' % next_page_url
            # Re-enter this same callback for the next page.
            yield scrapy.Request(next_page_url_full, callback=self.parse)
items.py
import scrapy


class CnblogsMysqlItem(scrapy.Item):
    """Container for one scraped cnblogs article, consumed by the
    file and MySQL pipelines."""

    # Article headline text.
    title = scrapy.Field()
    # URL of the article.
    title_url = scrapy.Field()
    # Short summary shown on the index page.
    summary = scrapy.Field()
pipelines.py
class CnblogsFilePipeline(object):
    """Pipeline that appends every scraped item to cnblogs.txt.

    open_spider/close_spider each run exactly once per crawl, so the
    file handle is opened and closed there; process_item runs once
    per item and only writes.
    """

    def open_spider(self, spider):
        # Runs once when the spider starts: acquire the output file.
        self.file = open('cnblogs.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Runs once per item: persist the three fields, space-separated.
        self.file.write(item['title'] + ' ')
        self.file.write(item['title_url'] + ' ')
        self.file.write(item['summary'] + ' ')
        # Return the item so lower-priority pipelines also receive it.
        return item

    def close_spider(self, spider):
        # Runs once when the spider finishes: release the file handle.
        self.file.close()


class CnblogsMysqlPipeline(object):
    """Pipeline that inserts every scraped item into the MySQL table
    c_cnblogs (database 'cnblogs')."""

    def open_spider(self, spider):
        # Imported here (not at module top) so this module can still be
        # loaded when pymysql is absent, e.g. with only the file
        # pipeline enabled in settings.
        import pymysql
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password="123456",
                                    database='cnblogs', port=3306)

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        sql = 'insert into c_cnblogs (title,title_url,summary)values(%s,%s,%s) '
        # BUG FIX: the original created a cursor per item and never
        # closed it; the context manager closes it even if execute()
        # raises.  Parameterized query keeps the input SQL-safe.
        with self.conn.cursor() as cursor:
            cursor.execute(sql, [item['title'], item['title_url'],
                                 item['summary']])
        self.conn.commit()
        return item
settings.py
# 开启管道 ITEM_PIPELINES = { 'firstscrapy.pipelines.CnblogsFilePipeline': 300, 'firstscrapy.pipelines.CnblogsMysqlPipeline': 305, # 'firstscrapy.pipelines.ChoutiFilePipeline': 300, #300表示为优先级,值越小优先级越高 # 'firstscrapy.pipelines.ChoutiMysqlPipeline': 305, }
from selenium import webdriver
import time
import requests

# --- Step 1: log in through a real browser so the site issues valid cookies.
bro = webdriver.Chrome(executable_path='./chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
login_b = bro.find_element_by_id('login_btn')
print(login_b)
login_b.click()
username = bro.find_element_by_name('phone')
username.send_keys('18953675221')
password = bro.find_element_by_name('password')
password.send_keys('lqz123')
button = bro.find_element_by_css_selector('button.login-btn')
button.click()
# A captcha may appear — leave time to solve it by hand.
time.sleep(10)
my_cookie = bro.get_cookies()  # list of {'name': ..., 'value': ...} dicts
print(my_cookie)
bro.close()

# --- Step 2: selenium cookies are a list of dicts, not the flat
# name->value mapping requests expects — convert once here.
cookie = {item['name']: item['value'] for item in my_cookie}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Referer': 'https://dig.chouti.com/'}

# --- Step 3: fetch the 24-hour hot list and collect the article ids.
ret = requests.get('https://dig.chouti.com/top/24hr?_=1596677637670',
                   headers=headers)
print(ret.json())
ll = [entry['id'] for entry in ret.json()['data']]
print(ll)

# --- Step 4: up-vote each article with the logged-in cookies.
for link_id in ll:
    # BUG FIX: the original URL began with a space
    # (' https://dig.chouti.com/link/vote'), which requests rejects
    # as an invalid URL.
    ret = requests.post('https://dig.chouti.com/link/vote',
                        headers=headers, cookies=cookie,
                        data={'linkId': link_id})
    print(ret.text)

# Comment-posting endpoint and its payload shape, kept for reference:
'https://dig.chouti.com/comments/create'
'''
content: 说的号
linkId: 29829529
parentId: 0
'''