响应状态码
状态码 | 说明 | 详情 |
---|---|---|
100 | 继续 | 请求者应当继续提出请求。服务器已收到请求的一部分,正在等待其余部分 |
101 | 切换协议 | 请求者已要求服务器切换协议,服务器已确认并准备切换 |
200 | 成功 | 服务器已成功处理了请求 |
201 | 已创建 | 请求成功并且服务器创建了新的资源 |
202 | 已接受 | 服务器已接受请求,但尚未处理 |
203 | 非授权信息 | 服务器已成功处理了请求,但返回的信息可能来自另一个源 |
204 | 无内容 | 服务器成功处理了请求,但没有返回任何内容 |
205 | 重置内容 | 服务器成功处理了请求,内容被重置 |
206 | 部分内容 | 服务器成功处理了部分请求 |
300 | 多种选择 | 针对请求,服务器可执行多种操作 |
301 | 永久移动 | 请求的网页已永久移动到新位置,即永久重定向 |
302 | 临时移动 | 请求的网页暂时跳转到其他页面,即暂时重定向 |
303 | 查看其他位置 | 如果原来的请求是POST,重定向目标文档应该通过GET提取 |
304 | 未修改 | 此次请求返回的网页未修改,继续使用上次的资源 |
305 | 使用代理 | 请求者应该使用代理访问该网页 |
307 | 临时重定向 | 请求的资源临时从其他位置响应 |
400 | 错误请求 | 服务器无法解析该请求 |
401 | 未授权 | 请求没有进行身份验证或验证未通过 |
403 | 禁止访问 | 服务器拒绝此请求 |
404 | 未找到 | 服务器找不到请求的网页 |
405 | 方法禁用 | 服务器禁用了请求中指定的方法 |
406 | 不接受 | 无法使用请求的内容响应请求的网页 |
407 | 需要代理授权 | 请求者需要使用代理授权 |
408 | 请求超时 | 服务器等候请求时发生超时 |
409 | 冲突 | 服务器在完成请求时发生冲突 |
410 | 已删除 | 请求的资源已永久删除 |
411 | 需要有效长度 | 服务器不接受不含有效长度标头字段的请求 |
412 | 未满足前提条件 | 服务器未满足请求者在请求中设置的其中一个前提条件 |
413 | 请求实体过大 | 请求实体过大,超出服务器的处理能力 |
414 | 请求URL过长 | 请求网址过长,服务器无法处理 |
415 | 不支持类型 | 请求格式不被请求页面支持 |
416 | 请求范围不符 | 页面无法提供请求的范围 |
417 | 未满足期望值 | 服务器未满足期望请求标头字段的要求 |
500 | 服务器内部错误 | 服务器遇到错误,无法完成请求 |
501 | 未实现 | 服务器不具备完成请求的功能 |
502 | 错误网关 | 服务器作为网关或代理,从上游服务器收到无效响应 |
503 | 服务不可用 | 服务器目前无法使用 |
504 | 网关超时 | 服务器作为网关或代理,但是没有及时从上游服务器收到响应 |
505 | HTTP版本不支持 | 服务器不支持请求中所用的HTTP协议版本 |
Requests的高级用法
文件上传
import requests

# Upload a local file as multipart form data. The file is opened in a
# context manager so the handle is closed once the request has been sent.
with open('favicon.ico', 'rb') as icon:
    files = {'file': icon}
    r = requests.post('http://httpbin.org/post', files=files)
print(r.text)
Cookies
import requests

# Fetch a page, show the cookie jar, then list each cookie as name=value.
r = requests.get('https://www.baidu.com')
print(r.cookies)
for key, value in r.cookies.items():
    print(f'{key}={value}')
会话维持
import requests

# Both requests go through the same Session, so the cookie set by the first
# call is sent with the second one. The context manager closes the session
# when the block exits.
with requests.Session() as s:
    s.get('http://httpbin.org/cookies/set/number/123456789')
    r = s.get('http://httpbin.org/cookies')
print(r.text)
SSL证书验证
import requests

# Request an HTTPS site and report the HTTP status code of the response.
url = 'https://www.12306.cn'
response = requests.get(url)
print(response.status_code)
代理设置
import requests

# The same SOCKS5 proxy URL is used for both http and https traffic.
# 'user', 'password', 'host' and 'port' are placeholders to be filled in.
proxy_url = 'socks5://user:password@host:port'
proxies = dict.fromkeys(('http', 'https'), proxy_url)
requests.get('https://www.taobao.com', proxies=proxies)
身份验证
import requests
from requests.auth import HTTPBasicAuth

# Send HTTP Basic credentials with the request and print the status code.
credentials = HTTPBasicAuth('username', 'password')
r = requests.get('http://localhost:5000', auth=credentials)
print(r.status_code)
爬取实例
动态渲染页面爬取
#会自动弹出谷歌浏览器,先跳转到百度,然后在搜索框中输入python,接着跳转到搜索结果页
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Named search_box rather than `input` so the builtin is not shadowed.
    # find_element(By.ID, ...) is used because the find_element_by_* helpers
    # were removed in Selenium 4.
    search_box = browser.find_element(By.ID, 'kw')
    search_box.send_keys('Python')
    search_box.send_keys(Keys.ENTER)
    # Wait up to 10 seconds for the results container to be present.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_all_elements_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # quit() ends the whole driver session; close() only closes the window.
    browser.quit()
验证码识别
- 图形验证码
import tesserocr
from PIL import Image
# Run OCR on the captcha image; the context manager closes the image file
# once the text has been extracted.
with Image.open('code.jpg') as image:
    result = tesserocr.image_to_text(image)
print(result)
- 极验滑动验证码识别
import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# NOTE(review): account credentials are hard-coded here; consider loading
# them from the environment or a config file instead.
EMAIL = 'cqc@cuiqingcai.com'
PASSWORD = ''
# Pixels subtracted from the detected gap position before sliding.
BORDER = 6
# First x column inspected when searching for the gap.
INIT_LEFT = 60


class CrackGeetest:
    """Drive a Chrome browser through the Geetest slider captcha login."""

    def __init__(self):
        self.url = 'https://account.geetest.com/login'
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD

    def __del__(self):
        # __init__ may have failed before self.browser was assigned, so do
        # not assume the attribute exists. quit() ends the driver session.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.quit()

    def get_geetest_button(self):
        """
        Wait for the initial verification button to become clickable.
        :return: the button element
        """
        button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
        return button

    def get_position(self):
        """
        Locate the captcha image on the page.
        :return: tuple (top, bottom, left, right) of the image bounds
        """
        img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """
        Take a screenshot of the current page.
        :return: the screenshot as a PIL image
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def get_slider(self):
        """
        Wait for the slider handle to become clickable.
        :return: the slider element
        """
        slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))
        return slider

    def get_geetest_image(self, name='captcha.png'):
        """
        Crop the captcha out of a page screenshot and save it to disk.
        :param name: file name the cropped image is saved under
        :return: the cropped PIL image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def open(self):
        """
        Open the login page and type in the email and password.
        :return: None
        """
        self.browser.get(self.url)
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
        email.send_keys(self.email)
        password.send_keys(self.password)

    def get_gap(self, image1, image2):
        """
        Find the x offset of the gap by comparing the two captcha images.
        :param image1: image without the gap
        :param image2: image with the gap
        :return: x of the first column (from INIT_LEFT rightwards) holding a
                 differing pixel, or INIT_LEFT when no column differs
        """
        left = INIT_LEFT
        for i in range(left, image1.size[0]):
            for j in range(image1.size[1]):
                if not self.is_pixel_equal(image1, image2, i, j):
                    return i
        return left

    def is_pixel_equal(self, image1, image2, x, y):
        """
        Check whether two images have (nearly) the same pixel at a position.
        :param image1: first image
        :param image2: second image
        :param x: x position
        :param y: y position
        :return: True when the first three channels each differ by less
                 than the threshold
        """
        pixel1 = image1.load()[x, y]
        pixel2 = image2.load()[x, y]
        threshold = 60
        return all(abs(pixel1[channel] - pixel2[channel]) < threshold for channel in range(3))

    def get_track(self, distance):
        """
        Build the list of per-step x offsets used to drag the slider.

        The motion accelerates for the first 4/5 of the distance and then
        decelerates. Each step is derived from the rounded cumulative
        position, and the position is capped at ``distance``, so the steps
        add up to exactly ``round(distance)`` instead of drifting through
        per-step rounding or overshooting on the last step.
        :param distance: total offset to travel
        :return: list of integer offsets
        """
        track = []
        # Exact (float) position reached so far.
        current = 0
        # Integer pixels already handed out in the track.
        moved = 0
        # Position at which the motion switches to deceleration.
        mid = distance * 4 / 5
        # Duration of one simulation step.
        t = 0.2
        # Current velocity.
        v = 0
        while current < distance:
            a = 2 if current < mid else -3
            v0 = v
            # v = v0 + a * t
            v = v0 + a * t
            # x = v0 * t + 1/2 * a * t^2, capped at the target
            current = min(current + v0 * t + 1 / 2 * a * t * t, distance)
            step = round(current) - moved
            moved += step
            track.append(step)
        return track

    def move_to_gap(self, slider, track):
        """
        Drag the slider along the given track and release it.
        :param slider: slider element
        :param track: list of x offsets
        :return: None
        """
        ActionChains(self.browser).click_and_hold(slider).perform()
        for x in track:
            ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
        time.sleep(0.5)
        ActionChains(self.browser).release().perform()

    def login(self):
        """
        Click the login button.
        :return: None
        """
        submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
        submit.click()
        time.sleep(10)
        print('登录成功')

    def crack(self, max_retries=3):
        """
        Run the whole flow: open the page, solve the slider, then log in.

        WebDriverWait.until raises TimeoutException rather than returning a
        falsy value, so a failed verification is detected by catching that
        exception.
        :param max_retries: how many more attempts are made after a failed
                            verification before the TimeoutException is
                            re-raised
        :return: None
        """
        # Type in the email and password.
        self.open()
        # Click the verification button.
        button = self.get_geetest_button()
        button.click()
        # Capture the captcha image.
        image1 = self.get_geetest_image('captcha1.png')
        # Click the slider, then capture the captcha image again.
        slider = self.get_slider()
        slider.click()
        image2 = self.get_geetest_image('captcha2.png')
        # Locate the gap.
        gap = self.get_gap(image1, image2)
        print('缺口位置', gap)
        # Subtract the border offset.
        gap -= BORDER
        # Build the drag track and drag the slider.
        track = self.get_track(gap)
        print('滑动轨迹', track)
        self.move_to_gap(slider, track)
        try:
            success = self.wait.until(
                EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
        except TimeoutException:
            if max_retries <= 0:
                raise
            success = False
        print(success)
        # Retry on failure.
        if success:
            self.login()
        else:
            self.crack(max_retries - 1)


if __name__ == '__main__':
    crack = CrackGeetest()
    crack.crack()
- 点击验证码的识别
import time
from io import BytesIO

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from chaojiying import Chaojiying

# NOTE(review): account credentials are hard-coded here; consider loading
# them from the environment or a config file instead.
EMAIL = 'cqc@cuiqingcai.com'
PASSWORD = ''

CHAOJIYING_USERNAME = 'Germey'
CHAOJIYING_PASSWORD = ''
CHAOJIYING_SOFT_ID = 893590
CHAOJIYING_KIND = 9102


class CrackTouClick:
    """Drive a Chrome browser through the TouClick click-captcha login."""

    def __init__(self):
        self.url = 'http://admin.touclick.com/login.html'
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)

    def __del__(self):
        # __init__ may have failed before self.browser was assigned, so do
        # not assume the attribute exists. quit() ends the driver session.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.quit()

    def open(self):
        """
        Open the login page and type in the email and password.
        :return: None
        """
        self.browser.get(self.url)
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
        email.send_keys(self.email)
        password.send_keys(self.password)

    def get_touclick_button(self):
        """
        Wait for the initial verification button to become clickable.
        :return: the button element
        """
        button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-hod-wrap')))
        return button

    def get_touclick_element(self):
        """
        Wait for the captcha image element to be present.
        :return: the image element
        """
        element = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'touclick-pub-content')))
        return element

    def get_position(self):
        """
        Locate the captcha image on the page.
        :return: tuple (top, bottom, left, right) of the image bounds
        """
        element = self.get_touclick_element()
        time.sleep(2)
        location = element.location
        size = element.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """
        Take a screenshot of the current page.
        :return: the screenshot as a PIL image
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def get_touclick_image(self, name='captcha.png'):
        """
        Crop the captcha out of a page screenshot and save it to disk.
        :param name: file name the cropped image is saved under
        :return: the cropped PIL image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def get_points(self, captcha_result):
        """
        Parse the recognition result into click coordinates.

        'pic_str' is read as '|'-separated groups of ','-separated integers.
        :param captcha_result: mapping returned by the recognition service
        :return: list of integer lists, one per group; an empty list when
                 'pic_str' is missing or empty
        """
        pic_str = captcha_result.get('pic_str')
        if not pic_str:
            # Nothing was recognised; report no points instead of failing
            # on None.split or int('').
            return []
        return [[int(number) for number in group.split(',')] for group in pic_str.split('|')]

    def touch_click_words(self, locations):
        """
        Click each recognised position on the captcha image.
        :param locations: list of [x, y] offsets
        :return: None
        """
        for location in locations:
            print(location)
            # NOTE(review): the origin of move_to_element_with_offset
            # (top-left corner vs. element centre) appears to depend on the
            # Selenium version - confirm against the installed release.
            ActionChains(self.browser).move_to_element_with_offset(
                self.get_touclick_element(), location[0], location[1]).click().perform()
            time.sleep(1)

    def touch_click_verify(self):
        """
        Click the captcha's submit button.
        :return: None
        """
        button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-pub-submit')))
        button.click()

    def login(self):
        """
        Click the login button.
        :return: None
        """
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, '_submit')))
        submit.click()
        time.sleep(10)
        print('登录成功')

    def crack(self, max_retries=3):
        """
        Run the whole flow: open the page, solve the captcha, then log in.

        WebDriverWait.until raises TimeoutException rather than returning a
        falsy value, so a failed verification is detected by catching that
        exception.
        :param max_retries: how many more attempts are made after a failed
                            verification before the TimeoutException is
                            re-raised
        :return: None
        """
        self.open()
        # Click the verification button.
        button = self.get_touclick_button()
        button.click()
        # Capture the captcha image and serialise it as PNG bytes.
        image = self.get_touclick_image()
        bytes_array = BytesIO()
        image.save(bytes_array, format='PNG')
        # Send the image to the recognition service.
        result = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)
        print(result)
        locations = self.get_points(result)
        self.touch_click_words(locations)
        self.touch_click_verify()
        # Check whether the verification succeeded.
        try:
            success = self.wait.until(
                EC.text_to_be_present_in_element((By.CLASS_NAME, 'touclick-hod-note'), '验证成功'))
        except TimeoutException:
            if max_retries <= 0:
                raise
            success = False
        print(success)
        # Retry on failure.
        if success:
            self.login()
        else:
            self.crack(max_retries - 1)


if __name__ == '__main__':
    crack = CrackTouClick()
    crack.crack()
- 微博宫格验证码的识别
import os
import time
from io import BytesIO
from os import listdir

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# NOTE(review): account credentials are hard-coded here; consider loading
# them from the environment or a config file instead.
USERNAME = '15874295385'
PASSWORD = 'fpdpvx119'

TEMPLATES_FOLDER = 'templates/'


class CrackWeiboSlide:
    """Drive a Chrome browser through the Weibo grid-pattern captcha."""

    def __init__(self):
        self.url = 'https://passport.weibo.cn/signin/login?entry=mweibo&r=https://m.weibo.cn/'
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.username = USERNAME
        self.password = PASSWORD

    def __del__(self):
        # __init__ may have failed before self.browser was assigned, so do
        # not assume the attribute exists. quit() ends the driver session.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.quit()

    def open(self):
        """
        Open the login page, type in the credentials and submit.
        :return: None
        """
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
        username.send_keys(self.username)
        password.send_keys(self.password)
        submit.click()

    def get_position(self):
        """
        Locate the captcha image on the page.

        If the captcha does not show up in time, the login page is opened
        again and the wait is repeated once; a second timeout propagates as
        TimeoutException.
        :return: tuple (top, bottom, left, right) of the image bounds
        """
        locator = (By.CLASS_NAME, 'patt-shadow')
        try:
            img = self.wait.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            print('未出现验证码')
            self.open()
            # Look the element up again so `img` is always bound below.
            img = self.wait.until(EC.presence_of_element_located(locator))
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    def get_screenshot(self):
        """
        Take a screenshot of the current page.
        :return: the screenshot as a PIL image
        """
        screenshot = self.browser.get_screenshot_as_png()
        screenshot = Image.open(BytesIO(screenshot))
        return screenshot

    def get_image(self, name='captcha.png'):
        """
        Crop the captcha out of a page screenshot and save it to disk.
        :param name: file name the cropped image is saved under
        :return: the cropped PIL image
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        captcha = screenshot.crop((left, top, right, bottom))
        captcha.save(name)
        return captcha

    def is_pixel_equal(self, image1, image2, x, y):
        """
        Check whether two images have (nearly) the same pixel at a position.
        :param image1: first image
        :param image2: second image
        :param x: x position
        :param y: y position
        :return: True when the first three channels each differ by less
                 than the threshold
        """
        pixel1 = image1.load()[x, y]
        pixel2 = image2.load()[x, y]
        threshold = 20
        return all(abs(pixel1[channel] - pixel2[channel]) < threshold for channel in range(3))

    def same_image(self, image, template):
        """
        Decide whether a captcha matches a template image.
        :param image: captcha to recognise
        :param template: template image
        :return: True when more than 99% of the captcha's pixels match
        """
        total = image.width * image.height
        # A template smaller than the captcha cannot be compared pixel by
        # pixel (indexing would run out of range); treat it as no match.
        if total == 0 or template.width < image.width or template.height < image.height:
            return False
        # Similarity threshold.
        threshold = 0.99
        count = sum(
            1
            for x in range(image.width)
            for y in range(image.height)
            if self.is_pixel_equal(image, template, x, y)
        )
        result = float(count) / total
        if result > threshold:
            print('成功匹配')
            return True
        return False

    def detect_image(self, image):
        """
        Match the captcha against every file in TEMPLATES_FOLDER.
        :param image: captcha image
        :return: drag order parsed from the digits of the matching
                 template's file name (the part before the first '.'),
                 or None when no template matches
        """
        for template_name in listdir(TEMPLATES_FOLDER):
            print('正在匹配', template_name)
            # Close each template file once it has been compared.
            with Image.open(os.path.join(TEMPLATES_FOLDER, template_name)) as template:
                matched = self.same_image(image, template)
            if matched:
                numbers = [int(number) for number in template_name.split('.')[0]]
                print('拖动顺序', numbers)
                return numbers
        return None

    def move(self, numbers):
        """
        Drag across the pattern points in the given order.
        :param numbers: 1-based indexes of the points, in drag order
        :return: None
        """
        # find_elements(By.CSS_SELECTOR, ...) is used because the
        # find_elements_by_* helpers were removed in Selenium 4.
        circles = self.browser.find_elements(By.CSS_SELECTOR, '.patt-wrap .patt-circ')
        last_index = len(numbers) - 1
        dx = dy = 0
        for index, number in enumerate(numbers):
            circle = circles[number - 1]
            if index == 0:
                # Press and hold on the first point; move_to_element is
                # used here to target the middle of the element.
                ActionChains(self.browser).move_to_element(circle).click_and_hold().perform()
            else:
                # Number of small moves per segment.
                times = 30
                moved_x = moved_y = 0
                for step in range(1, times + 1):
                    # Integer steps taken from the rounded cumulative
                    # target, so the segment adds up to exactly (dx, dy).
                    target_x = round(dx * step / times)
                    target_y = round(dy * step / times)
                    ActionChains(self.browser).move_by_offset(
                        target_x - moved_x, target_y - moved_y).perform()
                    moved_x, moved_y = target_x, target_y
                    time.sleep(1 / times)
            if index == last_index:
                # Release the mouse after the last point.
                ActionChains(self.browser).release().perform()
            else:
                # Offset from this point to the next one.
                following = circles[numbers[index + 1] - 1]
                dx = following.location['x'] - circle.location['x']
                dy = following.location['y'] - circle.location['y']

    def crack(self):
        """
        Run the whole flow: log in, capture the captcha, match and drag.
        :return: None
        """
        self.open()
        # Capture the captcha image.
        image = self.get_image('captcha.png')
        numbers = self.detect_image(image)
        if numbers:
            self.move(numbers)
        else:
            # No template matched, so there is no drag order to replay.
            print('未找到匹配的模板')
        time.sleep(10)
        print('识别结束')


if __name__ == '__main__':
    crack = CrackWeiboSlide()
    crack.crack()