Python_Crawler_04_Libraries_Req_BS4_HTMLParser_DB_Login

    Python Libraries: Requests, BS4, HTML Parser, Database, Login

    Requests + BeautifulSoup

    bs4, BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#

import requests
from bs4 import BeautifulSoup
from lxml import etree

# get target contents
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
cookies = {'Cookie': 'csrftoken=D5Vm2O0Ftf0K5b9zice10y3ZntK67YxM; sessionid=7390f8f82b756e834a101028c3398173; django_language=zh-CN; bb_language=zh-CN'}
url = 'https://www.yianyouxuan.com/mall'
try:
    res = requests.get(url, cookies=cookies, headers=headers)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    html_content = res.content
except requests.RequestException:
    print('request failure')


#### BeautifulSoup ####
soup = BeautifulSoup(html_content, 'html.parser')
#print(soup.prettify())
prod_list = soup.select('#mall-list > li > div > a')     # original copy: "#mall-list > li:nth-child(1) > div > a"
credit_list = soup.select('#mall-list > li > p > em')    # original copy: "#mall-list > li:nth-child(1) > p:nth-child(3) > em"
prod_list_title = []
credits = []
for item in prod_list:
    #print(item.attrs)
    prod_list_title.append(item.attrs['title'])
for item in credit_list:
    credits.append(int(item.string))    # or item.text to get the text
print(prod_list_title)
print(credits)


#### XPath ####
root = etree.HTML(html_content)
items = root.xpath('//*[@id="mall-list"]/li')    # original copy: //*[@id="mall-list"]/li[1]; remove the [1] -- li is a list
#print(len(items))
prod_name = []
credit = []
for item in items:
    prod_name.append(str(item.xpath('./h4/a/text()')[0]))    # original copy: //*[@id="mall-list"]/li[1]/h4/a
    credit.append(int(item.xpath('./p/em[@class="c_tx1"]/text()')[0]))    # original copy: //*[@id="mall-list"]/li[1]/p[1]/em
print(prod_name)
print(credit)


    I. Requests

    Examples:

    requests.get()

import requests

URL = 'http://www.baidu.com'
try:
    Res = requests.get(URL)
    Res.raise_for_status()    # raises for 4xx/5xx status -> jump to except; otherwise continue
    print(Res.status_code)
    print(Res.raise_for_status)    # without parentheses this prints the bound method itself
    print(Res.encoding)
    print(Res.apparent_encoding)
    Res.encoding = Res.apparent_encoding    # assign the apparent (detected) encoding so the text displays correctly
except requests.RequestException:
    print('request failure')

    #Outcome:

    200
    <bound method Response.raise_for_status of <Response [200]>>
    ISO-8859-1
    utf-8
    <!DOCTYPE html>
    <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8>
    <meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer>
    <link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css>
    <title>百度一下,你就知道</title></head>  

    After assigning Res.apparent_encoding to Res.encoding:

    >>> Res.text[0:1000]
    '<!DOCTYPE html>
    <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8>
    <meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer>
    <link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css>
    <title>百度一下,你就知道</title></head>

    requests.head() method
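    requests.head() fetches only the response headers, which is useful for checking a resource before downloading it. A quick sketch (httpbin.org used as a stand-in URL):

import requests

res = requests.head('http://httpbin.org/get')
print(res.status_code)
print(res.headers['Content-Type'])    # headers only; res.text is empty for a HEAD request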

    requests.post() method

>>> payload = {'key1': 'value1', 'key2': 'value2'}
>>> r = requests.post('http://httpbin.org/post', data=payload)    # POST a dict to the URL; it is automatically encoded as form data
>>> print(r.text)
{ ...
"form": {
"key2": "value2",
"key1": "value1"
},
}
>>> r = requests.post('http://httpbin.org/post', data='ABC')
>>> print(r.text)
{ ...
"data": "ABC",
"form": {},
}

    requests.put() method

>>> payload = {'key1': 'value1', 'key2': 'value2'}
>>> r = requests.put('http://httpbin.org/put', data=payload)
>>> print(r.text)
{ ...
"form": {
"key2": "value2",
"key1": "value1"
},
}

    # Passing URL parameters, e.g. http://aaa.com?pageId=1&type=content

import requests

URL = 'https://www.amazon.cn/dp/B01JG4J5PQ/ref=lp_2134663051_1_1?'
Params = {'s': 'grocery', 'ie': 'UTF8', 'qid': '1517822277', 'sr': '1-1'}
try:
    Res = requests.get(URL, params=Params)
    Res.raise_for_status()
    Res.encoding = Res.apparent_encoding
    print(Res.text[0:1000])
except requests.RequestException:
    print('failure')
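    To verify how the params dict is encoded into the query string, printing res.url helps; a quick sketch against httpbin:

import requests

res = requests.get('http://httpbin.org/get', params={'pageId': '1', 'type': 'content'})
print(res.url)    # http://httpbin.org/get?pageId=1&type=content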

    # Handling binary data

import requests
from PIL import Image
from io import BytesIO

URL = 'https://images-cn.ssl-images-amazon.com/images/I/71GGUs7JSXL._SL1000_.jpg'

try:
    Res = requests.get(URL)
    Image01 = Image.open(BytesIO(Res.content))    # load the raw bytes as an image
    Image01.save('nuts.jpg')
except Exception:
    print('failure')

    # Handling JSON

import requests

res = requests.get('https://github.com/timeline.json')
print(type(res.json()))    # res.json() parses the JSON body into Python objects
print(res.text)

    # Handling raw data (streamed download)

URL = 'https://images-cn.ssl-images-amazon.com/images/I/71GGUs7JSXL._SL1000_.jpg'

Res = requests.get(URL, stream=True)    # stream=True so chunks are read as they arrive rather than all at once

with open('nuts2.jpg', 'wb+') as f:
    for chunk in Res.iter_content(1024):
        f.write(chunk)

     

    # Submitting a form

    import json
    import requests
    
    form = {'username':'user','password':'pass'}
    r01 = requests.post('http://httpbin.org/post',data=form)
    print(r01.text)
    r02 = requests.post('http://httpbin.org/post',data=json.dumps(form))
    print(r02.text)

    #Outcome:

    {
      "args": {}, 
      "data": "", 
      "files": {}, 
      "form": {
        "password": "pass", 
        "username": "user"
      }, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Connection": "close", 
        "Content-Length": "27", 
        "Content-Type": "application/x-www-form-urlencoded", 
        "Host": "httpbin.org", 
        "User-Agent": "python-requests/2.9.1"
      }, 
      "json": null, 
      "origin": "60.247.94.218", 
      "url": "http://httpbin.org/post"
    }
    
    {
      "args": {}, 
      "data": "{"password": "pass", "username": "user"}", 
      "files": {}, 
      "form": {}, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Connection": "close", 
        "Content-Length": "40", 
        "Host": "httpbin.org", 
        "User-Agent": "python-requests/2.9.1"
      }, 
      "json": {
        "password": "pass", 
        "username": "user"
      }, 
      "origin": "60.247.94.218", 
      "url": "http://httpbin.org/post"
    }
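
    requests also accepts a json= keyword, which serializes the dict and sets the JSON Content-Type header in one step:

r03 = requests.post('http://httpbin.org/post', json=form)    # equivalent to data=json.dumps(form) plus the Content-Type header
print(r03.text)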
    

    Cookie   

import requests

url = 'http://www.baidu.com'
r = requests.get(url)
cookie = r.cookies
for k, v in cookie.get_dict().items():
    print(k, v)

cookies = {'c1': 'v1', 'c2': 'v2'}
r = requests.get('http://httpbin.org/cookies', cookies=cookies)
print(r.text)

    #outcome:

BDORZ 27315
{
  "cookies": {
    "c1": "v1",
    "c2": "v2"
  }
}
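
    For login-style flows (the "Login" in the title), a requests.Session persists cookies across requests automatically, instead of pasting a Cookie header by hand as in the first example. A minimal sketch using httpbin's cookie endpoints:

import requests

s = requests.Session()    # a Session stores cookies between requests
s.get('http://httpbin.org/cookies/set/sessionid/abc123')    # server sets a cookie; the Session keeps it
r = s.get('http://httpbin.org/cookies')
print(r.text)    # the stored cookie is sent back automatically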

      

    #Redirect and Redirect history

import requests

res = requests.head('http://github.com', allow_redirects=True)
print(res.url)
print(res.status_code)
print(res.history)

    #outcome

    https://github.com/
    200
    [<Response [301]>]
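
    To inspect the redirect itself rather than follow it, set allow_redirects=False:

import requests

res = requests.head('http://github.com', allow_redirects=False)
print(res.status_code)               # 301 -- the redirect response itself
print(res.headers.get('Location'))   # the redirect target, e.g. https://github.com/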
    

    # Proxies

proxies = {'http': ' ', 'https': ' '}
r = requests.get('...', proxies=proxies)
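
    A slightly fuller sketch; the proxy address below is a hypothetical placeholder for a real HTTP proxy:

import requests

proxies = {
    'http': 'http://127.0.0.1:8080',     # hypothetical local proxy -- replace with a real one
    'https': 'http://127.0.0.1:8080',
}
r = requests.get('http://httpbin.org/ip', proxies=proxies)
print(r.text)    # the origin IP the server sees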

    II. BS4 (Beautiful Soup)

     

    1. Beautiful Soup parsers

     

    With and without specifying a parser:

# without html.parser
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'))
print(soup.prettify())

    #outcome:

    /usr/lib/python3/dist-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
    
    To get rid of this warning, change this:
    
     BeautifulSoup([your markup])
    
    to this:
    
     BeautifulSoup([your markup], "lxml")
    
      markup_type=markup_type))
    <html>
    ...
    </html>
    

      

# with html.parser
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'), 'html.parser')    # explicitly specify the html.parser parser
print(soup.prettify())

     #outcome:

    <html>
     <head>
      <title>
       The Dormouse's story
      </title>
     </head>
     <body>
      <p class="title" name="dromouse">
       <b>
        The Dormouse's story
       </b>
      </p>
      <p class="story">
       Once upon a time there were three little sisters; and their names were
       <a class="sister" href="http:/example.com/elsie" id="link1">
        <!-- Elsie -->
       </a>
       ,
       <a class="sister" href="http:/example.com/elsie" id="link2">
        Lacie
       </a>
       and
       <a class="sister" href="http:/example.com/elsie" id="link3">
        Tillie
       </a>
       ;
    and they lived at the bottom of a well.
      </p>
      <p class="story">
       ...
      </p>
     </body>
    </html>
    

    2. Basic elements of the BeautifulSoup class

      Examples:

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'), 'html.parser')
#print(soup.prettify())

#Tag
print(type(soup.title))
print(soup.title.name)
print(soup.title)

#String
print(type(soup.title.string))
print(soup.title.string)

#Comment
print(type(soup.a.string))
print(soup.a.string)

#Traverse
for items in soup.body.contents:    # body's children are the three <p> tags
    print(items.name)
    #print(items)                   # uncomment to show the full contents of each child

    #outcomes:

    <class 'bs4.element.Tag'>
    title
    <title>The Dormouse's story</title>
<class 'bs4.element.NavigableString'>        # both print as plain text, but note that String and Comment are different types
    The Dormouse's story
    <class 'bs4.element.Comment'>
     Elsie 
    None
    p
    None
    p
    None
    p
    

    Example 2:

from bs4 import BeautifulSoup
import requests

def getHTML(req_url):
    req_header = {'user-agent': 'Mozilla/5.0'}    # pretend to be a Mozilla browser
    try:
        req_result = requests.get(req_url, headers=req_header)    # pass the headers as a keyword argument
        req_result.raise_for_status()            # check the status of the request
        req_result.encoding = req_result.apparent_encoding    # use the apparent encoding
        return req_result.text
    except requests.RequestException:
        print("HTML ERROR")

def HTMLParser(req_url):
    req_text = getHTML(req_url)
    soup = BeautifulSoup(req_text, 'html.parser')    # use html.parser to avoid parser warnings
    return soup

def HTMLOPS(req_url):
    soup = HTMLParser(req_url)
# title tag
    title = soup.title
    print(type(title))
    print(title)
# body tag
# show all contents of the HTML document
    print(soup.body.contents)
    print(soup.title.string)
#OR
    for item in soup.body.contents:
        print(item)
# a tag
    print(type(soup.a.string))    # the first <a> tag
    print(soup.a.string)
# tag name
    print(soup.a.name)
    print(soup.a.parent.name)

# tag attrs
    print(soup.a.attrs)
    print(soup.a.attrs['name'])
    #print(soup.div)

if __name__ == '__main__':
    url = input('please input the url you want to parse: ')
    HTMLOPS(url)
    #print(soup.prettify())

    #Outcome:

please input the url you want to parse: http://www.baidu.com
<--title-->
<class 'bs4.element.Tag'>
<title>百度一下 你就知道</title>

<--body contents-->
[' ', <div id="wrapper"> <div id="head"> <div class="head_wrapper"> <div class="s_form"> <div class="s_form_wrapper"> <div id="lg"> <img height="129" hidefocus="true" src="//www.baidu.com/img/bd_logo1.png" width="270"> </img></div> <form action="//www.baidu.com/s" class="fm" id="form" name="f"> <input name="bdorz_come" type="hidden" value="1"> <input name="ie" type="hidden" value="utf-8"> <input name="f" type="hidden" value="8"> <input name="rsv_bp" type="hidden" value="1"> <input name="rsv_idx" type="hidden" value="1"> <input name="tn" type="hidden" value="baidu"><span class="bg s_ipt_wr"><input autocomplete="off" autofocus="" class="s_ipt" id="kw" maxlength="255" name="wd" value=""/></span><span class="bg s_btn_wr"><input class="bg s_btn" id="su" type="submit" value="百度一下"/></span> </input></input></input></input></input></input></form> </div> </div> <div id="u1"> <a class="mnav" href="http://news.baidu.com" name="tj_trnews">新闻</a> <a class="mnav" href="http://www.hao123.com" name="tj_trhao123">hao123</a> <a class="mnav" href="http://map.baidu.com" name="tj_trmap">地图</a> <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">视频</a> <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">贴吧</a> <noscript> <a class="lb" href="http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1" name="tj_login">登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">更多产品</a> </div> </div> </div> <div id="ftCon"> <div id="ftConw"> <p id="lh"> <a href="http://home.baidu.com">关于百度</a> <a href="http://ir.baidu.com">About Baidu</a> </p> <p id="cp">©2017 Baidu <a href="http://www.baidu.com/duty/">使用百度前必读</a> <a class="cp-feedback" href="http://jianyi.baidu.com/">意见反馈</a>京ICP证030173 <img src="//www.baidu.com/img/gs.gif"> </img></p> </div> </div> </div>, ' ']
百度一下,你就知道

<div id="wrapper"> <div id="head"> <div class="head_wrapper"> <div class="s_form"> <div class="s_form_wrapper"> <div id="lg"> <img height="129" hidefocus="true" src="//www.baidu.com/img/bd_logo1.png" width="270"> </img></div> <form action="//www.baidu.com/s" class="fm" id="form" name="f"> <input name="bdorz_come" type="hidden" value="1"> <input name="ie" type="hidden" value="utf-8"> <input name="f" type="hidden" value="8"> <input name="rsv_bp" type="hidden" value="1"> <input name="rsv_idx" type="hidden" value="1"> <input name="tn" type="hidden" value="baidu"><span class="bg s_ipt_wr"><input autocomplete="off" autofocus="" class="s_ipt" id="kw" maxlength="255" name="wd" value=""/></span><span class="bg s_btn_wr"><input class="bg s_btn" id="su" type="submit" value="百度一下"/></span> </input></input></input></input></input></input></form> </div> </div> <div id="u1"> <a class="mnav" href="http://news.baidu.com" name="tj_trnews">新闻</a> <a class="mnav" href="http://www.hao123.com" name="tj_trhao123">hao123</a> <a class="mnav" href="http://map.baidu.com" name="tj_trmap">地图</a> <a class="mnav" href="http://v.baidu.com" name="tj_trvideo">视频</a> <a class="mnav" href="http://tieba.baidu.com" name="tj_trtieba">贴吧</a> <noscript> <a class="lb" href="http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1" name="tj_login">登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" : "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a class="bri" href="//www.baidu.com/more/" name="tj_briicon" style="display: block;">更多产品</a> </div> </div> </div> <div id="ftCon"> <div id="ftConw"> <p id="lh"> <a href="http://home.baidu.com">关于百度</a> <a href="http://ir.baidu.com">About Baidu</a> </p> <p id="cp">©2017 Baidu <a href="http://www.baidu.com/duty/">使用百度前必读</a> <a class="cp-feedback" href="http://jianyi.baidu.com/">意见反馈</a>京ICP证030173 <img src="//www.baidu.com/img/gs.gif"> </img></p> </div> </div> </div>

<--tag a-->
<class 'bs4.element.NavigableString'>
新闻

<--tag name-->
a
div

<--tag attributes-->
{'name': 'tj_trnews', 'class': ['mnav'], 'href': 'http://news.baidu.com'}
tj_trnews

      

    3. Traversing HTML content with bs4
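
    The original diagrams for this section did not survive; as a stand-in, a minimal sketch of the common traversal attributes, assuming the same test.html as above:

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('test.html'), 'html.parser')

# downward traversal: .contents (a list) and .children (an iterator)
print(soup.head.contents)
for child in soup.body.children:
    print(child.name)

# upward traversal: .parent and .parents
print(soup.title.parent.name)    # head

# sideways traversal: .next_sibling / .previous_sibling
first_a = soup.a
print(first_a.next_sibling)              # may be a NavigableString such as ','
print(first_a.find_next_sibling('a'))    # the next <a> tag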


    # CSS selector queries

print(soup.select('.sister'))         # search by class; note the leading dot; returns all matches
print(soup.select('#link1'))          # search by id; note the leading #
print(soup.select('head > title'))    # search by parent > child relationship

a_s = soup.select('a')                # loop over all <a> tags and print each
for a in a_s:
    print(a)
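
    select() is the CSS route; bs4's find_all() expresses the same queries with keyword arguments. A short sketch using the same soup object:

print(soup.find_all('a', class_='sister'))    # same as soup.select('a.sister')
print(soup.find_all('a', id='link1'))         # same as soup.select('a#link1')
print(soup.find_all('a', limit=2))            # only the first two <a> tags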

    HTMLParser   # In Python 3, the HTMLParser module was renamed to html.parser.

# Python 3
import html.parser

# Python 2.2 and above
import HTMLParser
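
    A minimal sketch of the stdlib parser (Python 3 names): subclass HTMLParser and override the handler callbacks:

from html.parser import HTMLParser

class LinkParser(HTMLParser):
    """Collect the href of every <a> tag."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):    # attrs is a list of (name, value) tuples
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

parser = LinkParser()
parser.feed('<p><a href="http://www.baidu.com">百度</a></p>')
print(parser.links)    # ['http://www.baidu.com']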

     

    sqlite3  # bundled with Python; small, convenient, and flexible.

import sqlite3

conn = sqlite3.connect('test')
create_sql = 'create table company(id int primary key not null, emp_name text not null);'
conn.execute(create_sql)
insert_sql = 'insert into company values(?, ?)'    # parameterized query prevents SQL injection; concatenating strings is vulnerable
conn.execute(insert_sql, (100, 'LY'))
conn.execute(insert_sql, (200, 'July'))
cursors = conn.execute('select id, emp_name from company')
for row in cursors:
    print(row[0], row[1])

conn.commit()    # persist the inserts before closing
conn.close()

    Notes on MySQL:

    1. MySQL connections need host (ip/port), username, and password.

    2. After executing write statements, call conn.commit().

    https://www.tutorialspoint.com/python/python_database_access.htm
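
    A minimal sketch with the PyMySQL driver (pip install pymysql); the host, credentials, and database below are hypothetical placeholders:

import pymysql

# placeholder connection details -- replace with your own
conn = pymysql.connect(host='127.0.0.1', port=3306,
                       user='user', password='pass', database='testdb')
try:
    with conn.cursor() as cur:
        cur.execute('insert into company values(%s, %s)', (300, 'Tom'))    # %s placeholders, parameterized as with sqlite3
    conn.commit()    # MySQL requires an explicit commit for writes
    with conn.cursor() as cur:
        cur.execute('select id, emp_name from company')
        for row in cur.fetchall():
            print(row)
finally:
    conn.close()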
