  • Making requests in a web crawler

    urllib.request

    import urllib.request
    
    url = 'http://www.baidu.com/'
    response = urllib.request.urlopen(url).read()
    
    # urllib.request.urlopen opens the URL; .read() returns the raw page bytes
    import urllib.request
    
    url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1543822283&di=b327e6e2dc59105bcb73a174bff94919&imgtype=jpg&er=1&src=http%3A%2F%2Ftupian.qqjay.com%2Fu%2F2017%2F1201%2F2_161641_2.jpg'
    res = urllib.request.urlopen(url)
    
    # Image download, method 1: write the response bytes yourself
    with open('fengjing.jpg','wb') as f:
        f.write(res.read())
    
    # Image download, method 2: let urlretrieve handle the file
    urllib.request.urlretrieve(url,'tupian.jpg')
    
    # urllib.request.urlretrieve downloads the resource straight to the given path
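    
    urlretrieve also takes an optional reporthook callback, which is handy for showing progress on larger downloads. A minimal sketch (the hook receives the block count, block size, and total size):
    
    def report_progress(blocks, block_size, total_size):
        # total_size can be -1 when the server sends no Content-Length
        if total_size > 0:
            percent = min(blocks * block_size * 100 / total_size, 100)
            print(f'{percent:.1f}%')
    
    urllib.request.urlretrieve(url, 'tupian.jpg', reporthook=report_progress)
    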
    import urllib.request
    
    url = 'http://www.baidu.com/'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        'Accept-Language':'zh-CN,zh;q=0.9'
    }
    
    # Build the request object with the custom headers, then send it
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request).read()

    # urllib.request.Request builds a request object carrying custom request headers
    import urllib.request
    
    handler = urllib.request.ProxyHandler({'http': '124.243.226.18:8888'})
    opener = urllib.request.build_opener(handler)
    url = 'http://www.baidu.com/s?wd=IP'
    headers = {
        "Host": "www.baidu.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
        "Accept-Language": "zh-CN,zh;q=0.8",
    }
    request = urllib.request.Request(url=url,headers=headers)
    response = opener.open(request)
    print(response.read().decode())
    
    # Route this program's traffic through a proxy
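    
    For comparison, requests accepts a proxies dict directly, so none of the opener machinery is needed. A minimal sketch (the proxy address is copied from above and is almost certainly stale by now):
    
    import requests
    
    proxies = {'http': 'http://124.243.226.18:8888'}
    response = requests.get('http://www.baidu.com/s?wd=IP', headers=headers, proxies=proxies)
    print(response.text)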

    urllib.parse

    import urllib.parse
    
    url = 'http://www.baidu.com/s?wd=%E7%9F%A5%E4%B9%8E'
    res = urllib.parse.urlparse(url)
    print(res)
    
    # Output:
    ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='', query='wd=%E7%9F%A5%E4%B9%8E', fragment='')
    
    # urllib.parse.urlparse splits a url into its components
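    
    The inverse also exists: urllib.parse.urlunparse reassembles the six components into a URL again. A quick sketch using the res from above:
    
    print(urllib.parse.urlunparse(res))
    # http://www.baidu.com/s?wd=%E7%9F%A5%E4%B9%8E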
    import urllib.parse
    
    word = '知乎'
    
    res = urllib.parse.quote(word)
    print(res)
    
    res = urllib.parse.unquote(res)
    print(res)
    
    # Output:
    %E7%9F%A5%E4%B9%8E
    知乎
    
    # urllib.parse.quote percent-encodes a string; urllib.parse.unquote decodes it back
    import urllib.parse
    
    params = {
        'wd':'知乎'
    }
    word = urllib.parse.urlencode(params)
    print(word)
    
    # Output:
    wd=%E7%9F%A5%E4%B9%8E
    
    # urllib.parse.urlencode encodes a dict into the query-string format shown above
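    
    A common pattern is to join the encoded query onto a base URL before sending the request. A minimal sketch continuing with the same params dict (note Baidu may reject requests without a browser User-Agent, as shown earlier):
    
    import urllib.parse
    import urllib.request
    
    params = {'wd': '知乎'}
    full_url = 'http://www.baidu.com/s?' + urllib.parse.urlencode(params)
    response = urllib.request.urlopen(full_url)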

    requests

    import requests
    
    response = requests.get('http://www.baidu.com/')
    print(response.url)
    print(response.text)
    print(response.status_code)
    print(response.headers)
    print(response.cookies)
    print(response.content.decode())
    Basic methods
    http://www.baidu.com/
    <!DOCTYPE html>
    <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>百度一下,你就知道</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=百度一下 class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>&copy;2017&nbsp;Baidu&nbsp;<a href=http://www.baidu.com/duty/>使用百度前必读</a>&nbsp; <a href=http://jianyi.baidu.com/ class=cp-feedback>意见反馈</a>&nbsp;京ICP证030173号&nbsp; <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>
    
    200
    {'Cache-Control': 'private, no-cache, no-store, proxy-revalidate, no-transform', 'Connection': 'Keep-Alive', 'Content-Encoding': 'gzip', 'Content-Type': 'text/html', 'Date': 'Mon, 03 Dec 2018 13:39:39 GMT', 'Last-Modified': 'Mon, 23 Jan 2017 13:27:32 GMT', 'Pragma': 'no-cache', 'Server': 'bfe/1.0.8.18', 'Set-Cookie': 'BDORZ=27315; max-age=86400; domain=.baidu.com; path=/', 'Transfer-Encoding': 'chunked'}
    <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
    (response.content.decode() prints the same HTML as response.text above)
    Output
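    
    One caveat with response.text: requests decodes response.content using an encoding guessed from the response headers, and the guess is sometimes wrong for Chinese pages. Setting response.encoding before reading .text avoids garbled output. A minimal sketch:
    
    import requests
    
    response = requests.get('http://www.baidu.com/')
    response.encoding = 'utf-8'  # override the guessed encoding before touching .text
    print(response.text)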
    '''
    Target site:
        http://www.shicimingju.com/book/hongloumeng.html
    Approach:
        1. Determine the url
        2. Fake a browser User-Agent
        3. Send the request
        4. Fetch the content of the page the url points to
        5. Parse the page for chapter titles and chapter links
        6. Download each chapter's text to a local file
    '''
    
    import requests
    from bs4 import BeautifulSoup  # the 'lxml' parser used below also requires lxml to be installed
    
    
    def handle_requests(url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
        }
        # headers must be passed as a keyword argument: the second positional
        # argument of requests.get is params, not headers
        response = requests.get(url, headers=headers)
        return response
    
    
    def download_text(title, href):
        response = handle_requests(href)
        content = response.text
        soup = BeautifulSoup(content, 'lxml')
        res = soup.find('div', class_='chapter_content').find_all('p')
        with open('hongloumeng.txt', 'a', encoding='utf-8') as f:
            f.write(title)
            print(title)
            for p in res:
                f.write(p.text)
    
    
    def parse_content(content):
        soup = BeautifulSoup(content,'lxml')
        res = soup.select('.book-mulu > ul > li > a')
        for i in res:
            title = i.text
            href = 'http://www.shicimingju.com' + i['href']
            download_text(title,href)
            print('Downloading...')
    
    
    def main():
        url = 'http://www.shicimingju.com/book/hongloumeng.html'
        # Build the headers and send the request
        response = handle_requests(url)
        # Read the full page content from the response
        content = response.text
        # Parse the page for chapter titles and links
        parse_content(content)
    
    if __name__ == '__main__':
        main()
    Scraping a novel with requests
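    
    Each chapter download above opens a fresh connection. A requests.Session reuses connections and carries shared headers, which speeds up scraping a hundred-plus chapters; a minimal sketch of how handle_requests could be restructured around one:
    
    import requests
    
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    })
    
    def handle_requests(url):
        # Every call now shares one pooled connection and the same headers
        return session.get(url)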
    import requests
    
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    data = {
        'cname':'',
        'pid':'',
        'keyword':'杭州',
        'pageIndex':'1',
        'pageSize':'10',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
    }
    
    response = requests.post(url, data=data, headers=headers)
    print(response.text)
    A POST request with requests
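    
    This endpoint replies with JSON, so response.json() parses the body straight into Python objects instead of leaving a raw string. A small sketch (the exact layout of the payload is not shown here):
    
    result = response.json()  # raises ValueError if the body is not valid JSON
    print(result)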

    selenium

    selenium started out as a testing tool; crawlers use it mainly because requests cannot execute the JavaScript on a page.

    At its core, selenium drives a real browser and fully simulates user actions such as navigation, typing, clicking, and scrolling, so you get the page as rendered. Multiple browsers are supported.

    from selenium import webdriver
    import time
    
    # Raw string so the backslashes in the driver path survive
    path = r'D:\Python1806\spider\day5\chromedriver.exe'
    url = 'https://so.gushiwen.org/user/login.aspx'
    browser = webdriver.Chrome(executable_path=path)
    
    # Open the login page
    browser.get(url)
    time.sleep(2)
    
    # Locate the username and password inputs
    username = browser.find_element_by_xpath('//input[@id="email"]')
    username.send_keys('your_email@example.com')  # placeholder account
    time.sleep(1)
    
    pwd = browser.find_element_by_xpath("//input[@id='pwd']")
    pwd.send_keys('your_password')  # placeholder password
    time.sleep(1)
    
    # Screenshot the page so the captcha can be read manually
    browser.save_screenshot('login.png')
    code = input('Enter the captcha shown in login.png: ')
    checkcode = browser.find_element_by_xpath("//input[@id='code']")
    checkcode.send_keys(code)
    
    login = browser.find_element_by_xpath("//input[@id='denglu']")
    login.click()
    Basic selenium usage
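    
    The fixed time.sleep calls above are fragile: too short and the element is not yet on the page, too long and the script idles. Selenium's explicit waits poll for a condition instead. A minimal sketch using the same element id as above:
    
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    # Wait up to 10 seconds for the email input to appear, then use it
    username = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.ID, 'email'))
    )
    username.send_keys('your_email@example.com')  # placeholder account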