  • scrapy practice

    1. Crawl the articles on the cnblogs homepage and print each title and link URL

    spiders/cnblogs.py

    import scrapy
    
    
    class CnblogsSpider(scrapy.Spider):
        name = 'cnblogs'
        allowed_domains = ['www.cnblogs.com']
        start_urls = ['http://www.cnblogs.com/']
    
        def parse(self, response):
            # Crawl the cnblogs homepage articles and print each title and link URL
            # print(response.text)
            # 1. Grab the tag objects for every article on the current page
            # article_list = response.xpath('//article[contains(@class,"post-item")]')
            article_list = response.css('.post-item')
            # print(article_list)
            for article in article_list:
                # 2. Extract the title and the title's URL
                title = article.css('.post-item-title::text').extract_first()
                title_url = article.css('.post-item-title::attr(href)').extract_first()
                print("""
                文章标题:%s
                文章链接:%s
                """ % (title, title_url))

    2. Crawl cnblogs articles, save each title, link URL, and summary to MySQL, following pagination across n pages

    How continuous next-page crawling works:

    # After each page is crawled, use a selector to check whether a next-page link exists.
    # If it does, build the full next-page URL and yield a new request for it with the current parse
    # function as the callback, so crawling and the next-page check repeat, until no next-page link
    # is found, which means every page has been crawled and the spider finishes.

    spiders/cnblogs.py

    import scrapy
    from firstscrapy.items import CnblogsMysqlItem
    
    class CnblogsSpider(scrapy.Spider):
        name = 'cnblogs'
        allowed_domains = ['www.cnblogs.com']
        start_urls = ['http://www.cnblogs.com/']
    
        def parse(self, response):
            # Crawl cnblogs articles, save each title, link URL, and summary to MySQL, page after page
            article_list = response.css('.post-item')
            for article in article_list:
                item = CnblogsMysqlItem()
                title = article.css('.post-item-title::text').extract_first()
                title_url = article.css('.post-item-title::attr(href)').extract_first()
                summary = article.css('.post-item-summary::text').extract_first()
                # Pack the parsed data into the Item object. Note: item.title = title is NOT supported
                item['title'] = title
                item['title_url'] = title_url
                item['summary'] = summary
                yield item
    
            # Check whether the current page has a next-page link
            pager = response.xpath('//div[@class="pager"]')
            next_page = pager.xpath('.//a[last()]/text()').extract_first()
            # print(next_page)
            if next_page == '>':
                next_page_url = pager.xpath('.//a[last()]/@href').extract_first()
                next_page_url_full = 'https://www.cnblogs.com%s' % next_page_url
                # print(next_page_url_full)
                yield scrapy.Request(next_page_url_full, callback=self.parse)
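
    Building the absolute URL by hand works here, but Scrapy's response.urljoin() resolves a relative href against response.url and is less fragile; a minimal sketch of the same next-page logic using it:

        if next_page == '>':
            next_page_url = pager.xpath('.//a[last()]/@href').extract_first()
            # urljoin resolves the relative href against the current page's URL
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)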

    items.py

    import scrapy
    
    
    class CnblogsMysqlItem(scrapy.Item):
        title = scrapy.Field()
        title_url = scrapy.Field()
        summary = scrapy.Field()
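
    scrapy.Item is dict-like: fields declared with scrapy.Field() are read and written via item['field'], while attribute-style assignment raises an AttributeError, which is why the spider uses item['title'] = title. A quick demonstration:

    item = CnblogsMysqlItem()
    item['title'] = 'hello'      # dict-style assignment works
    print(item['title'])         # dict-style access works
    # item.title = 'hello'       # AttributeError: use item['title'] = ... instead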

    pipelines.py

    from itemadapter import ItemAdapter
    
    
    class CnblogsFilePipeline(object):
        # The methods below all override hooks of the parent class:
        # Runs once, when the spider starts
        def open_spider(self, spider):
            self.file = open('cnblogs.txt', 'w', encoding='utf-8')
    
        # process_item is called once per item, so the file open/close operations live in the two hook methods that each run only once.
        def process_item(self, item, spider):
            # Persist the item to the text file
            self.file.write(item['title'] + '\n')
            self.file.write(item['title_url'] + '\n')
            self.file.write(item['summary'] + '\n')
            return item
    
        # Runs once, when the spider closes
        def close_spider(self, spider):
            self.file.close()
    
    
    import pymysql
    
    
    class CnblogsMysqlPipeline(object):
        def open_spider(self, spider):
            self.conn = pymysql.connect(host='127.0.0.1', user='root', password="123456",
                                        database='cnblogs', port=3306)
    
        def close_spider(self, spider):
            self.conn.close()
    
        def process_item(self, item, spider):
            cursor = self.conn.cursor()
            sql = 'insert into c_cnblogs (title, title_url, summary) values (%s, %s, %s)'
            cursor.execute(sql, [item['title'], item['title_url'], item['summary']])
            self.conn.commit()
            return item
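
    The MySQL pipeline assumes the cnblogs database and c_cnblogs table already exist. A one-off setup sketch (the column types here are assumptions; adjust them to your data):

    import pymysql

    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306)
    with conn.cursor() as cursor:
        cursor.execute('CREATE DATABASE IF NOT EXISTS cnblogs CHARACTER SET utf8mb4')
        # Assumed schema: sizes/types are guesses matching the scraped fields
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS cnblogs.c_cnblogs (
                id INT AUTO_INCREMENT PRIMARY KEY,
                title VARCHAR(255),
                title_url VARCHAR(512),
                summary TEXT
            ) CHARACTER SET utf8mb4
        ''')
    conn.commit()
    conn.close()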

    settings.py

    # Enable the item pipelines; the value is the priority: the lower the number, the higher the priority
    ITEM_PIPELINES = {
        'firstscrapy.pipelines.CnblogsFilePipeline': 300,
        'firstscrapy.pipelines.CnblogsMysqlPipeline': 305,
    }
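
    Note: a freshly generated Scrapy project sets ROBOTSTXT_OBEY = True, which can cause requests to be filtered; if pages come back empty, this settings.py line (not shown in the original) is usually the fix:

    ROBOTSTXT_OBEY = False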

    3. Automatically upvote posts on Chouti (dig.chouti.com)

    from selenium import webdriver
    import time
    import requests
    
    bro=webdriver.Chrome(executable_path='./chromedriver.exe')
    bro.implicitly_wait(10)
    bro.get('https://dig.chouti.com/')
    
    login_b=bro.find_element_by_id('login_btn')
    print(login_b)
    login_b.click()
    
    username=bro.find_element_by_name('phone')
    username.send_keys('18953675221')
    password=bro.find_element_by_name('password')
    password.send_keys('lqz123')
    
    button=bro.find_element_by_css_selector('button.login-btn')
    button.click()
    # A captcha may pop up; solve it manually
    time.sleep(10)
    
    
    my_cookie=bro.get_cookies()  # a list of cookie dicts
    print(my_cookie)
    bro.close()
    
    # get_cookies() returns a list, not a dict, so it can't be passed to requests directly; convert it first
    cookie={}
    for item in my_cookie:
        cookie[item['name']]=item['value']
    
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Referer': 'https://dig.chouti.com/'}
    # ret = requests.get('https://dig.chouti.com/',headers=headers)
    # print(ret.text)
    
    
    ret=requests.get('https://dig.chouti.com/top/24hr?_=1596677637670',headers=headers)
    print(ret.json())
    ll=[]
    for item in ret.json()['data']:
        ll.append(item['id'])
    
    print(ll)
    for id in ll:
        ret=requests.post('https://dig.chouti.com/link/vote',headers=headers,cookies=cookie,data={'linkId':id})
        print(ret.text)
    
    # Posting a comment uses this endpoint, with the form payload sketched below:
    'https://dig.chouti.com/comments/create'
    '''
    content: <comment text>
    linkId: 29829529
    parentId: 0
    '''
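
    A minimal sketch of posting a comment with that payload, reusing the headers and converted cookie from the vote loop above (the content string is just an example):

    ret = requests.post('https://dig.chouti.com/comments/create',
                        headers=headers, cookies=cookie,
                        data={'content': 'nice post', 'linkId': 29829529, 'parentId': 0})
    print(ret.text)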
  • Original post: https://www.cnblogs.com/baicai37/p/13442241.html