1.案例一
a.创建项目
scrapy startproject renren_login
进入项目路径
scrapy genspider renren "renren.com"
renren.py
# -*- coding: utf-8 -*-
import scrapy


class RenrenSpider(scrapy.Spider):
    """Log in to renren.com with a form POST, then save a profile page to disk."""

    name = 'renren'
    allowed_domains = ['renren.com']
    start_urls = ['http://renren.com/']

    def start_requests(self):
        """Replace the default start requests with a single login POST."""
        credentials = {"email": "xxxxxxxx@126.com", "password": "xxxxxxx"}
        login_endpoint = "http://www.renren.com/PLogin.do"
        yield scrapy.FormRequest(
            login_endpoint,
            formdata=credentials,
            callback=self.parse_page,
        )

    def parse_page(self, response):
        """After the login response arrives, request the profile page."""
        yield scrapy.Request(
            url='http://www.renren.com/326282648/profile',
            callback=self.parse_profile,
        )

    def parse_profile(self, response):
        """Write the profile page body to ``wenliang.html`` as UTF-8 text."""
        page_text = response.text
        with open("wenliang.html", "w", encoding="utf-8") as out_file:
            out_file.write(page_text)
在项目路径下创建start.py
from scrapy import cmdline

# Run the "renren" spider the same way `scrapy crawl renren` would from a shell,
# so the project can be started (and debugged) from an IDE.
crawl_argv = ["scrapy", "crawl", "renren"]
cmdline.execute(crawl_argv)
2.案例二
a.手动输入验证码
创建项目
scrapy startproject douban_login
进入项目路径
scrapy genspider douban "douban.com"
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban_login project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'douban_login'

SPIDER_MODULES = ['douban_login.spiders']
NEWSPIDER_MODULE = 'douban_login.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# The browser User-Agent is declared through the dedicated setting rather than
# inside DEFAULT_REQUEST_HEADERS, so there is a single place that defines it.
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'douban_login.middlewares.DoubanLoginSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'douban_login.middlewares.DoubanLoginDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'douban_login.pipelines.DoubanLoginPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
douban.py
# -*- coding: utf-8 -*-
import scrapy
from urllib import request
from PIL import Image


class DoubanSpider(scrapy.Spider):
    """Log in to Douban, asking the user to type the captcha, then edit the signature.

    Flow: ``parse`` (submit login form) -> ``parse_after_login`` (check the
    redirect) -> ``parse_profile`` (read the ``ck`` token and post the new
    signature).
    """

    name = 'douban'
    allowed_domains = ['douban.com']
    start_urls = ['https://www.douban.com/login']
    login_url = "https://www.douban.com/login"
    profile_url = "https://www.douban.com/people/184480369/"
    editsignature_url = "https://www.douban.com/j/people/184480369/edit_signature"

    def parse(self, response):
        """Build the login form from the login page and submit it.

        When the page shows a captcha image, the user is asked to solve it and
        the solution is sent together with the page's ``captcha-id`` value.
        """
        formdata = {
            "source": "None",
            "redir": "https://www.douban.com/",
            "form_email": "xxxxxx@qq.com",
            "form_password": "xxxxxx!",
            "remember": "on",
            "login": "登录",
        }
        captcha_url = response.css("img#captcha_image::attr(src)").get()
        if captcha_url:
            captcha_id = response.xpath("//input[@name='captcha-id']/@value").get()
            if captcha_id is None:
                # A None value cannot be form-encoded; stop here instead of
                # prompting the user for a captcha that cannot be submitted.
                self.logger.error("Captcha image present but no captcha-id field found")
                return
            formdata["captcha-solution"] = self.regonize_captcha(captcha_url)
            formdata["captcha-id"] = captcha_id
        yield scrapy.FormRequest(
            url=self.login_url,
            formdata=formdata,
            callback=self.parse_after_login,
        )

    def parse_after_login(self, response):
        """Treat landing on the Douban home page as a successful login."""
        if response.url == "https://www.douban.com/":
            yield scrapy.Request(self.profile_url, callback=self.parse_profile)
            print("登录成功")
        else:
            print("登录失败")

    def parse_profile(self, response):
        """Post a new signature using the ``ck`` token found on the profile page."""
        print(response.url)
        if response.url == self.profile_url:
            print("进入到了个人中心")
            ck = response.xpath("//input[@name='ck']/@value").get()
            if ck is None:
                # Without the token the edit request cannot be built.
                self.logger.error("No ck token found on the profile page")
                return
            formdata = {
                "ck": ck,
                "signature": "丈夫处世兮立功名",
            }
            yield scrapy.FormRequest(self.editsignature_url, formdata=formdata)
        else:
            print("没有进入个人中心")

    def regonize_captcha(self, image_url):
        """Download the captcha image, display it, and return what the user types.

        The image is saved as ``captcha.png`` in the working directory. The
        call blocks on ``input()`` until the user answers.
        """
        request.urlretrieve(image_url, "captcha.png")
        # Use the image as a context manager so its file handle is released.
        with Image.open("captcha.png") as image:
            image.show()
        captcha = input("请输入验证码:")
        return captcha
在douban_login目录下创建start.py
from scrapy import cmdline

# Run the "douban" spider as if `scrapy crawl douban` had been typed in a shell.
crawl_command = "scrapy crawl douban"
cmdline.execute(crawl_command.split())
执行start.py即可
b.自动识别验证码
from base64 import b64encode
from urllib import request

import requests

# Download the captcha image that should be recognised.
captcha_url = "https://www.douban.com/misc/captcha?id=TCEAV2F8SbBgKbXZ5JAI2G6L:en&size=s"
request.urlretrieve(captcha_url, "captcha.png")

# Endpoint of the captcha-recognition service (placeholder).
recognize_url = "http://xxxxxx"

formdata = {}
with open("captcha.png", "rb") as fp:
    data = fp.read()
    # The raw image bytes must be base64-ENCODED before being sent as a form
    # field; decoding them (as the code previously did) corrupts the payload.
    pic = b64encode(data)
    formdata['pic'] = pic

appcode = 'xxxxxxxxxxxxxxx'
headers = {
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    # The scheme and the code are separated by a space: "APPCODE <code>".
    'Authorization': 'APPCODE ' + appcode,
}

response = requests.post(recognize_url, data=formdata, headers=headers)
# Print the response body (the recognition result), not the Response repr.
print(response.text)
c.其他自动识别案例
import base64
import os
import time

import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By


def _require_env(name):
    """Return the value of environment variable *name*, exiting if it is unset."""
    value = os.environ.get(name)
    if not value:
        raise SystemExit("Environment variable {} must be set".format(name))
    return value


# Secrets are read from the environment so that no account password or API
# key is stored in the source file.
douban_username = _require_env("DOUBAN_USERNAME")
douban_password = _require_env("DOUBAN_PASSWORD")
captcha_appcode = _require_env("CAPTCHA_APPCODE")

# Drive a real browser through the login form.
driver = webdriver.Chrome()
try:
    url = 'https://accounts.douban.com/login?alias=&redir=https%3A%2F%2Fwww.douban.com%2F&source=index_nav&error=1001'
    driver.get(url)
    time.sleep(1)
    driver.find_element(By.ID, 'email').send_keys(douban_username)
    time.sleep(1)
    driver.find_element(By.ID, 'password').send_keys(douban_password)
    time.sleep(1)

    # Look for a captcha image in the rendered page.
    html_str = driver.page_source
    html_ele = etree.HTML(html_str)
    captcha_srcs = html_ele.xpath('//img[@id="captcha_image"]/@src')

    # Only solve a captcha when the page actually shows one.
    if captcha_srcs:
        image_url = captcha_srcs[0]
        # Fetch the image bytes and base64-encode them for the recognition API.
        # https://market.aliyun.com/products/57124001/cmapi028447.html?spm=5176.2020520132.101.5.2HEXEG#sku=yuncode2244700000
        response = requests.get(image_url)
        b64_str = base64.b64encode(response.content)
        v_type = 'cn'
        # Form data posted to the captcha-solving service.
        form = {
            'v_pic': b64_str,
            'v_type': v_type,
        }
        # Authentication header for the captcha-solving service.
        headers = {
            'Authorization': 'APPCODE ' + captcha_appcode,
        }
        # Ask the service for the captcha text.
        dmpt_url = 'http://yzmplus.market.alicloudapi.com/fzyzm'
        response = requests.post(dmpt_url, form, headers=headers)
        print(response.text)
        # captcha_value is the recognised captcha text.
        captcha_value = response.json()['v_code']
        print(image_url)
        print(captcha_value)
        # Manual alternative: prompt the user with input('请输入验证码') instead.
        driver.find_element(By.ID, 'captcha_field').send_keys(captcha_value)
        time.sleep(1)

    driver.find_element(By.CLASS_NAME, 'btn-submit').click()
    time.sleep(1)

    # Collect every cookie of the logged-in browser session.
    cookies = driver.get_cookies()
    # To save the page as rendered by the browser instead, write
    # driver.page_source (UTF-8 encoded) to a file here, before the driver quits.
finally:
    # Always shut the browser down, even when a step above fails.
    driver.quit()

# Turn each cookie dict into "name=value" and join them into one Cookie header.
cookie_list = [
    cookie_dict['name'] + '=' + cookie_dict['value']
    for cookie_dict in cookies
]
header_cookie = '; '.join(cookie_list)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Cookie': header_cookie,
}

# Reuse the browser's session cookies with requests to fetch another page.
another_url = 'https://www.douban.com/accounts/'
response = requests.get(another_url, headers=headers)

with open('cc.html', 'wb') as f:
    f.write(response.content)