  • Notes on web scraping fundamentals

    1.PyQuery
    from pyquery import PyQuery as pq

    # Initializing from an HTML string
    # html = ''
    # doc = pq(html)

    # Initializing from a URL
    url = 'https://www.baidu.com'
    doc = pq(url=url)
    print(doc('head'))

    # Initializing from a file
    doc = pq(filename='xxx.html')
    print(doc('li'))

    # Basic CSS selectors
    doc('#id .class a')   # a tags inside .class elements inside #id
    doc('#id .class.a')   # no space: both selectors apply to the same element

    # Finding elements
    item = doc('.class')
    lis = item.find('li')         # search descendants
    lis = item.children()         # direct children only (less common)
    lis = item.children('.class')

    lis = item.parent()           # parent element
    lis = item.parents()          # ancestor elements
    lis = item.parents('li')

    item.siblings()               # sibling elements
    item.siblings('')             # siblings filtered by a selector

    # Iterating
    lst = doc('li').items()       # generator of PyQuery objects
    for ls in lst:
        pass

    # Getting attributes
    lis.attr('href')
    lis.attr.href

    # Getting text
    lis.text()

    # Getting HTML
    lis.html()

    # DOM manipulation
    lis.remove_class('class')     # class names are passed without the leading dot
    lis.add_class('class')

    lis.attr('name', 'link')      # add a name='link' attribute
    lis.css('font-size', '14px')  # add a CSS property

    lis.find('p').remove()        # remove p tags

    # Pseudo-class selectors
    doc('li:first-child')         # first element
    doc('li:last-child')          # last element
    doc('li:nth-child(2)')        # second element
    doc('li:gt(2)')               # elements whose index is greater than 2 (zero-based)
    doc('li:nth-child(2n)')       # even-numbered elements
    doc('li:contains(second)')    # elements containing the text "second"
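    A small runnable sketch of the selectors above on an inline HTML snippet (the markup is made up for illustration):

    from pyquery import PyQuery as pq

    html = '''
    <div id="container">
        <ul class="list">
            <li class="item"><a href="link1.html">first item</a></li>
            <li class="item active"><a href="link2.html">second item</a></li>
        </ul>
    </div>
    '''

    doc = pq(html)
    for a in doc('#container .list li a').items():   # items() yields each match as a PyQuery object
        print(a.attr('href'), a.text())
    # link1.html first item
    # link2.html second item

    print(doc('li:last-child').text())               # second item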

    2.requests

    import requests

    url = 'https://www.baidu.com'
    resp = requests.get(url)
    print(resp.cookies)
    print(resp.text)

    # GET with query parameters
    data = {
        '': '',
        '': ''
    }
    resp = requests.get(url, params=data)

    # Parsing JSON
    print(resp.json())  # same as json.loads(resp.text)

    # Binary content
    print(resp.content)
    with open('', 'wb') as f:
        f.write(resp.content)

    # Adding headers
    headers = {'User-Agent': ''}
    resp = requests.get(url, headers=headers)

    # POST
    data = {}
    resp = requests.post(url, data=data)
    resp = requests.post(url, data=data, headers=headers)

    # >>> Advanced usage
    #   1. File upload
    files = {'file': open('', 'rb')}
    resp = requests.post(url, files=files)

    #   2. Getting cookies
    for key, value in resp.cookies.items():
        print(key + '=' + value)

    #   3. Session persistence
    import requests
    # Two separate requests.get calls do not share cookies:
    # requests.get('https://httpbin.org/cookies/set/number/12346789')
    # resp = requests.get('https://httpbin.org/cookies')
    s = requests.Session()
    s.get('https://httpbin.org/cookies/set/number/12346789')
    resp = s.get('https://httpbin.org/cookies')

    #   4. Certificate verification
    import requests
    resp = requests.get('https://www.12306.cn', verify=False)
    resp = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))

    #   5. Proxies
    import requests
    proxies = {
        'http': 'http://127.0.0.1:9473',
        'https': 'https://127.0.0.1:9473',
        # 'http': 'http://user:password@127.0.0.1:9473',  # proxy with username and password
    }
    resp = requests.get(url, proxies=proxies)

    #   6. Authentication
    import requests
    from requests.auth import HTTPBasicAuth
    resp = requests.get(url, auth=HTTPBasicAuth('user', '123'))

    import requests
    resp = requests.get(url, auth=('', ''))

    #   7. Exception handling
    from requests.exceptions import ReadTimeout, ConnectionError, RequestException
    try:
        pass
    except ReadTimeout:
        pass
    except ConnectionError:
        pass
    except RequestException:
        pass
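    A minimal end-to-end sketch tying the pieces above together (the URL, parameters, and User-Agent string are placeholders for illustration):

    import requests
    from requests.exceptions import RequestException

    def fetch(url, params=None, timeout=5):
        """Fetch a page and return its text, or None on any request error."""
        headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder UA string
        try:
            resp = requests.get(url, params=params, headers=headers, timeout=timeout)
            resp.raise_for_status()              # raise for 4xx/5xx responses
            return resp.text
        except RequestException as e:
            print('request failed:', e)
            return None

    html = fetch('https://httpbin.org/get', params={'q': 'python'})
    if html:
        print(html[:200])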

    3.selenium

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait as wdw

    url = 'https://www.baidu.com'
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        input = browser.find_element_by_id('kw')
        input.send_keys('Python')
        input.send_keys(Keys.ENTER)
        # input.clear()
        wait = wdw(browser, 10)
        wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
        print(browser.current_url)
        print(browser.get_cookies())
        print(browser.page_source)
    finally:
        browser.close()

    # Creating a browser instance
    browser = webdriver.Chrome()
    browser = webdriver.Firefox()
    browser = webdriver.Edge()
    browser = webdriver.PhantomJS()
    browser = webdriver.Safari()

    # Finding a single element
    browser.find_element_by_id('q')
    browser.find_element_by_css_selector('#q')
    browser.find_element_by_xpath('//*[@id="q"]')
    browser.find_element(By.ID, 'q')

    # Finding multiple elements
    browser.find_elements(By.CSS_SELECTOR, '.class li')
    browser.find_elements_by_css_selector('.class li')

    # Interacting with elements
    button = browser.find_element_by_class_name('')
    button.click()

    # Action chains
    from selenium.webdriver import ActionChains
    browser = webdriver.Chrome()
    url = ''
    browser.get(url)
    browser.switch_to.frame('')          # switch into the frame that holds the elements
    source = browser.find_element_by_css_selector('#')
    target = browser.find_element_by_css_selector('#')
    actions = ActionChains(browser)
    actions.drag_and_drop(source, target)
    actions.perform()

    # Executing JavaScript
    browser.execute_script('alert()')

    # Getting element attributes
    logo = browser.find_element_by_css_selector('#')
    logo.get_attribute('class')

    # Getting the text of an element
    logo.text

    # Getting the id, location, tag name, and size
    logo.location
    logo.id
    logo.tag_name
    logo.size

    # Frames
    from selenium.common.exceptions import NoSuchElementException

    browser.switch_to.frame('')
    browser.switch_to.parent_frame()

    # Waits
    # Implicit wait: raises an exception if the element is still missing after 10 seconds
    browser.implicitly_wait(10)

    # Explicit wait (more common)
    wait = wdw(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'q')))
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '')))

    # Back and forward
    browser.back()
    browser.forward()

    # Cookies
    browser.get_cookies()
    browser.add_cookie({'name': '', 'value': ''})  # takes a dict with at least name and value
    browser.delete_all_cookies()

    # Tab management
    browser.execute_script('window.open()')              # open a new tab
    browser.switch_to.window(browser.window_handles[1])  # switch to it

    # Exception handling
    from selenium.common.exceptions import TimeoutException, NoSuchElementException
    try:
        pass
    except TimeoutException:
        pass
    except NoSuchElementException:
        pass
    finally:
        browser.close()
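    The find_element_by_* helpers above were removed in Selenium 4; a minimal sketch of the same Baidu search using the current find_element(By, ...) style with headless Chrome and an explicit wait (the element IDs are taken from the example above):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    options = webdriver.ChromeOptions()
    options.add_argument('--headless')               # run without opening a window

    browser = webdriver.Chrome(options=options)
    try:
        browser.get('https://www.baidu.com')
        box = browser.find_element(By.ID, 'kw')      # the search box id used in the notes
        box.send_keys('Python', Keys.ENTER)
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, 'content_left')))
        print(browser.current_url)
    finally:
        browser.quit()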

    4.re

    import re

    # match --- matches from the start of the string
    content = 'Hello 123 4567 World_This is a Regex Demo'
    result = re.match(r'^Hello\s\d\d\d\s\d{4}\s\w{10}.*$', content)
    print(result)
    print(result.group())
    print(result.span())

    # Reusing a pattern
    pattern = re.compile('^Hello.*Demo$', re.S)
    result = re.match(pattern, content)

    # Generic matching
    result = re.match('^Hello.*Demo$', content)
    # (\d+) captures the digits; . does not match newlines (use re.S for that)
    # Greedy matching: .*    Non-greedy matching: .*?

    # search  --- returns the first successful match anywhere in the string
    # findall --- returns all matches as a list
    # sub     --- replaces matches
    # compile --- compiles a pattern string into a regex object

    # Tips: prefer generic patterns, use () to capture the target, prefer non-greedy matching,
    # use re.S when the text contains newlines, and prefer search over match when possible

    print('Practice ' + 20 * '-')
    import requests
    content = requests.get('https://book.douban.com/').text
    print(content)
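    A short sketch illustrating the tips above on the sample string (the pattern and replacement are purely for illustration):

    import re

    content = 'Hello 123 4567 World_This\nis a Regex Demo'

    # search scans the whole string, so there is no need to anchor at the start;
    # re.S lets . match the newline, and (\d+) captures the first run of digits non-greedily
    m = re.search(r'Hello.*?(\d+).*Demo', content, re.S)
    if m:
        print(m.group(1))                 # -> 123

    # findall returns every digit run as a list
    print(re.findall(r'\d+', content))    # -> ['123', '4567']

    # sub replaces matches
    print(re.sub(r'\d+', '', content))    # digits removed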

    5.urllib

    '''
    >>> The urllib package
       --- urllib.request       request module
       --- urllib.error         exception handling module
       --- urllib.parse         URL parsing module
       --- (urllib.robotparser  robots.txt parsing module) --- not a focus here
    '''

    url = 'https://www.baidu.com'

    ### GET request
    import urllib.request
    resp = urllib.request.urlopen(url)
    print(resp.read().decode('utf-8'))

    ### POST request
    import urllib.parse
    import urllib.request
    data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf8')
    resp = urllib.request.urlopen(url, data=data)
    print(resp.read())

    ### Exceptions
    import urllib.request
    import urllib.error
    import socket
    try:
        resp = urllib.request.urlopen(url, timeout=0.1)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')

    ### Response
    resp.status
    resp.getheaders()
    resp.getheader('Server')

    ### Adding parameters and headers
    ### Request objects
    import urllib.request
    request = urllib.request.Request(url)
    resp = urllib.request.urlopen(request)

    from urllib import request, parse
    headers = {
        'User-Agent': ''}  # or req.add_header('', '')
    data = bytes(parse.urlencode({'': ''}), encoding='utf8')  # POST data must be bytes
    req = request.Request(url=url, data=data, headers=headers, method='POST')
    resp = request.urlopen(req)

    ### Proxies (handlers)
    from urllib import request

    proxy_handler = request.ProxyHandler({
        'http': '//xxx.x.x.x:xxxx',
        'https': '//xxx.x.x.x:xxxx'
    })
    opener = request.build_opener(proxy_handler)
    resp = opener.open(url)

    ### Cookies
    import http.cookiejar, urllib.request
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    resp = opener.open(url)
    for item in cookie:
        print(item.name + '=' + item.value)

    # Saving cookies to a file
    import http.cookiejar, urllib.request
    filename = 'cookie.txt'
    cookie = http.cookiejar.MozillaCookieJar(filename)  # or http.cookiejar.LWPCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    resp = opener.open(url)
    cookie.save(ignore_discard=True, ignore_expires=True)

    # Loading cookies from a file
    import http.cookiejar, urllib.request
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    resp = opener.open(url)

    # **** Important ****
    # urlencode
    from urllib.parse import urlencode
    params = {
        '': '',
        '': ''
    }
    base_url = 'https://www.baidu.com?'
    url = base_url + urlencode(params)

    # URL parsing
    from urllib.parse import urlparse
    result = urlparse(url)                        # splits the URL into its components
    result = urlparse(url, scheme='https')        # scheme used only when the URL itself has none
    result = urlparse(url, allow_fragments=False)
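    A tiny sketch of the urlencode/urlparse round trip above (the query parameter names are illustrative):

    from urllib.parse import urlencode, urlparse

    params = {'wd': 'python', 'pn': '20'}        # hypothetical query parameters
    url = 'https://www.baidu.com/s?' + urlencode(params)
    print(url)      # https://www.baidu.com/s?wd=python&pn=20

    parts = urlparse(url)
    print(parts.scheme, parts.netloc, parts.path, parts.query)
    # https www.baidu.com /s wd=python&pn=20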


    6.xpath
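
    A minimal XPath sketch using lxml, assuming the lxml package is installed (the HTML snippet is made up for illustration):

    from lxml import etree

    html = '''
    <div>
        <ul>
            <li class="item"><a href="link1.html">first item</a></li>
            <li class="item"><a href="link2.html">second item</a></li>
        </ul>
    </div>
    '''

    tree = etree.HTML(html)                              # parse the HTML string
    print(tree.xpath('//li'))                            # all li nodes
    print(tree.xpath('//li/a/@href'))                    # -> ['link1.html', 'link2.html']
    print(tree.xpath('//li[@class="item"]/a/text()'))    # -> ['first item', 'second item']
    print(tree.xpath('//li[last()]/a/text()'))           # -> ['second item']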

    7.bs

    from bs4 import BeautifulSoup as bs
    html = ''                  # BeautifulSoup parses markup, not a URL
    soup = bs(html, 'lxml')
    soup.prettify()            # pretty-print the parse tree
    soup.title.string          # text of the title tag

    # >>>> Tag selectors
    # Selecting elements
    print(soup.head)           # the head tag
    print(soup.p)              # the first p tag
    # Getting the name
    soup.title.name            # ---> 'title'
    # Getting attributes
    soup.p.attrs['name']       # the name attribute of the p tag
    soup.p['name']
    # Getting the content
    soup.p.string              # text of the p tag
    # Nested selection
    soup.head.title.string     # text of the title tag inside head
    # Children and descendants
    soup.p.contents            # child nodes of the p tag as a list
    soup.p.children            # child nodes as an iterator
    for i, child in enumerate(soup.p.children):
        print(i, child)
    soup.p.descendants         # all descendant nodes as an iterator
    # Parent / ancestors
    soup.p.parent              # parent node
    soup.p.parents             # ancestor nodes
    # Siblings
    soup.a.next_siblings
    soup.a.previous_siblings

    # >>>> Standard selectors
    # find_all
    soup.find_all('li')        # all li tags
    soup.find_all('li')[0]     # the first li tag
    # attrs
    soup.find_all(attrs={'id': 'list_a'})  # all tags with id="list_a"
    soup.find_all(id='list_a')             # all tags with id="list_a"
    soup.find_all(class_='list_a')         # all tags with class="list_a"
    # text
    soup.find_all(text='')     # match by the text content of a tag

    # find_next_siblings()
    # find_next_sibling()
    # find_previous_siblings()
    # find_previous_sibling()
    # find_next()
    # find_previous()
    # find_all_next()
    # find_all_previous()

    # >>> CSS selectors
    # select
    soup.select('.class_ .class_1')
    soup.select('ul li')
    tar = soup.select('#id .class_')[0]  # first tag with class="class_" inside id="id" (select returns a list)
    soup.select('ul')[0]
    # Getting attributes
    tar['']
    tar.attrs['']
    # Getting the content
    tar.get_text()
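    A minimal sketch of the selectors above on a made-up snippet (the id and class names are illustrative):

    from bs4 import BeautifulSoup

    html = '''
    <html><head><title>Demo</title></head>
    <body>
        <ul id="list_a">
            <li class="item"><a href="link1.html">first</a></li>
            <li class="item"><a href="link2.html">second</a></li>
        </ul>
    </body></html>
    '''

    soup = BeautifulSoup(html, 'lxml')
    print(soup.title.string)                        # Demo
    print(soup.find_all('li')[0].a['href'])         # link1.html
    print([a.get_text() for a in soup.select('#list_a .item a')])  # ['first', 'second']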