  • Python crawler prerequisite techniques

    1. How to use BeautifulSoup to parse HTML

    Reposted from: http://blog.csdn.net/u013372487/article/details/51734047
    
    
    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    from bs4 import BeautifulSoup
    import re
    
    # The HTML string to parse
    html_doc = """
    <html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
    <p class="title aq">
        <b>
            The Dormouse's story
        </b>
    </p>
    
    <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> 
        and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.
    </p>
    
    <p class="story">...</p>
    """
    
    
    # Create a BeautifulSoup object from the HTML string
    # (from_encoding only applies to bytes input, so it is omitted for a str)
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Print the first <title> tag
    print(soup.title)

    # Print the tag name of the first <title> tag
    print(soup.title.name)

    # Print the text content of the first <title> tag
    print(soup.title.string)

    # Print the tag name of the first <title> tag's parent
    print(soup.title.parent.name)

    # Print the first <p> tag
    print(soup.p)

    # Print the class attribute of the first <p> tag
    print(soup.p['class'])

    # Print the href attribute of the first <a> tag
    print(soup.a['href'])
    '''
    Tag attributes can be added, deleted, or modified;
    once again, they are manipulated exactly like a dictionary.
    '''
    # Change the href attribute of the first <a> tag to http://www.baidu.com/
    soup.a['href'] = 'http://www.baidu.com/'

    # Add a name attribute to the first <a> tag
    soup.a['name'] = u'百度'

    # Delete the class attribute of the first <a> tag
    del soup.a['class']

    # Print all child nodes of the first <p> tag
    print(soup.p.contents)

    # Print the first <a> tag
    print(soup.a)

    # Print all <a> tags, as a list
    print(soup.find_all('a'))

    # Print the first tag whose id attribute equals "link3"
    print(soup.find(id="link3"))

    # Get all of the text content
    print(soup.get_text())

    # Print all attributes of the first <a> tag
    print(soup.a.attrs)


    for link in soup.find_all('a'):
        # Get the href attribute of each link
        print(link.get('href'))

    # Iterate over the children of the first <p> tag
    for child in soup.p.children:
        print(child)

    # Regex match: tags whose name contains "b" (e.g. <body>, <b>)
    for tag in soup.find_all(re.compile("b")):
        print(tag.name)
    
    
    
    2. How to use cookies in a crawler

    Reference: https://cuiqingcai.com/968.html
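
    The linked article targets Python 2's urllib2/cookielib; as a minimal Python 3 sketch of the same idea (the URL here is only a placeholder), cookies can be captured and replayed with http.cookiejar:

    import http.cookiejar
    import urllib.request

    # Cookies from Set-Cookie response headers accumulate in this jar.
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie_jar))

    # First request: the server's cookies are stored in the jar.
    opener.open('http://www.baidu.com')
    for cookie in cookie_jar:
        print(cookie.name, '=', cookie.value)

    # Later requests through the same opener send those cookies back.
    response = opener.open('http://www.baidu.com')
    print(response.getcode())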
    
    
    3. Headers, proxies, timeouts, authentication, and exception handling

    Reference: http://blog.csdn.net/m_buddy/article/details/55193762
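
    A compact sketch combining those pieces (the User-Agent string and the commented-out proxy address are placeholders, not values from the linked post):

    import urllib.request
    from urllib.error import URLError, HTTPError

    url = 'http://www.baidu.com'
    # Many sites reject the default Python-urllib User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)

    # Optional proxy (placeholder address):
    # proxy = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:8087'})
    # urllib.request.install_opener(urllib.request.build_opener(proxy))

    try:
        # timeout is in seconds; a timeout surfaces as URLError
        response = urllib.request.urlopen(request, timeout=10)
        print(response.getcode())
    except HTTPError as e:   # the server answered with an error status
        print(e.code)
    except URLError as e:    # network-level failure: DNS error, timeout...
        print(e.reason)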
    
    
    4. Error and exception handling


    1. URLError and HTTPError
    
    # -*- coding: UTF-8 -*-
    import urllib.request
    from urllib.error import URLError, HTTPError

    if __name__ == "__main__":
        # A page that does not exist on the site (the server answers 403)
        url = "http://www.douyu.com/Jack_Cui.html"
        request = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(request)
            # html = response.read()
        except HTTPError as e:
            print(e.code)
    
    
    Output:

    C:\Python34\python.exe G:/xiaoshuo2.py
    403

    Process finished with exit code 0
    
    
    
    
    
    
    # -*- coding: UTF-8 -*-
    import urllib.request
    from urllib.error import URLError, HTTPError

    if __name__ == "__main__":
        # The same nonexistent page; this time the body would be read on success
        url = "http://www.douyu.com/Jack_Cui.html"
        request = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(request)
            html = response.read().decode('utf-8')
            print(html)
        except HTTPError as e:
            print(e.code)
    
    
    Output:

    C:\Python34\python.exe G:/xiaoshuo2.py
    403


    Process finished with exit code 0
    
    
    
    
    
    import urllib.request
    from urllib.error import URLError, HTTPError

    url = "http://www.douyu.com/Jack_Cui.html"

    rep = urllib.request.Request(url)
    try:
        data = urllib.request.urlopen(rep)
    except URLError as e:
        # HTTPError is a subclass of URLError and carries both attributes,
        # so a 403 triggers both branches below
        if hasattr(e, 'code'):
            print("HTTPError")
            print(e.code)
        if hasattr(e, 'reason'):
            print("URLError")
            print(e.reason)
    
    
    Output:


    C:\Python34\python.exe G:/xiaoshuo2.py
    HTTPError
    403
    URLError
    Forbidden

    Process finished with exit code 0
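
    Because HTTPError is a subclass of URLError (and a 403 carries both a code and a reason), both hasattr branches fire above. A cleaner pattern, shown here as a sketch rather than as code from the referenced posts, is to catch the more specific exception first:

    import urllib.request
    from urllib.error import URLError, HTTPError

    try:
        response = urllib.request.urlopen('http://www.douyu.com/Jack_Cui.html')
    except HTTPError as e:
        # The server responded, but with an error status (403 here)
        print('HTTPError:', e.code)
    except URLError as e:
        # No usable response: DNS failure, refused connection, timeout...
        print('URLError:', e.reason)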
    
    
    
    
    
    5. Controlling line breaks when printing in Python
    
    
    https://www.cnblogs.com/kfx2007/p/5970784.html
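
    The basic mechanics, as a quick Python 3 sketch:

    # print() appends '\n' by default; end= overrides it
    print('a', end=' ')   # stays on the same line
    print('b')            # prints "a b", then a newline
    print('c')            # starts a new line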
    
    
    Example:
    
    
    # coding=utf-8
    import re

    language = '''
    <table class="infobox bordered vcard" style="width: 21em; font-size: 89%; text-align: left;" cellpadding="3">
    <caption style="text-align: center; font-size: larger;" class="fn"><b>jenkins</b></caption>
    <tr>
    <th>性別:</th>
    <td>男</td>
    </tr>
    <tr>
    <th>異名:</th>
    <td><span class="nickname">(字) 翔宇</span></td>
    </tr>
    <tr>
    <th>爱好:</th>
    <td><span class="org"><a href="../articles/%E4%B8%AD9A.html" title="篮球">篮球</a></span></td>
    </tr>
    <tr>
    <th>籍貫:</th>
    <td><a href="../articles/%E6%B5%9981.html" title="广西省">广西省</a><a href="../articles/%E7%BB%8D82.html" title="桂林市">桂林市</a></td>
    </tr>
    </table>
    '''

    # Extract the <tr> rows of the table
    res_tr = r'<tr>(.*?)</tr>'
    m_tr = re.findall(res_tr, language, re.S | re.M)
    for line in m_tr:
        # First column: the <th> header
        res_th = r'<th>(.*?)</th>'
        m_th = re.findall(res_th, line, re.S | re.M)
        for mm in m_th:
            if "href" in mm:  # if the <th> contains a hyperlink, unwrap it
                restr = r'<a href=.*?>(.*?)</a>'
                h = re.findall(restr, mm, re.S | re.M)
                print(h[0], end=' ')  # end=' ' suppresses the newline
            else:
                print(mm, end=' ')    # print without a line break

        # Second column: the <td> value
        res_td = r'<td>(.*?)</td>'  # r'<td .*?>(.*?)</td>'
        m_td = re.findall(res_td, line, re.S | re.M)
        for nn in m_td:
            if "href" in nn:  # handle hyperlinks like <a href=... rel=...></a>
                res_value = r'<a .*?>(.*?)</a>'
                m_value = re.findall(res_value, nn, re.S | re.M)
                for value in m_value:
                    print(value, end=' ')
            elif "span" in nn:  # handle <span> tags,
                # e.g. <td><span class="nickname">(字) 翔宇</span></td>
                res_value = r'<span .*?>(.*?)</span>'
                m_value = re.findall(res_value, nn, re.S | re.M)
                for value in m_value:
                    print(value, end=' ')
            else:
                print(nn, end=' ')
            print(' ')  # line break
    
    
    
    C:\Python34\python.exe G:/xiaoshuo2.py
    性別: 男  
    異名: (字) 翔宇  
    爱好: 篮球  
    籍貫: 广西省 桂林市  
    
    
    
    6. Printing without a newline in Python (Baidu Tieba crawler example)
    
    
    https://www.cnblogs.com/hwd9654/p/5707920.html
    
    
    # -*- coding:utf-8 -*-
    import re
    import urllib.request
    from urllib.error import URLError, HTTPError

    class Tool:
        # Strip <img> tags and 7-space runs
        removeImg = re.compile('<img.*?>| {7}|')
        # Strip link tags
        removeAddr = re.compile('<a.*?>|</a>')
        # Turn table rows, divs, and paragraph ends into newlines
        replaceLine = re.compile('<tr>|<div>|</div>|</p>')
        # Turn table cells into tabs
        replaceTD = re.compile('<td>')
        # Turn paragraph starts into newlines
        replacePara = re.compile('<p.*?>')
        # Turn <br> tags into newlines
        replaceBR = re.compile('<br><br>|<br>')
        # Strip any remaining tags
        removeExtraTag = re.compile('<.*?>')

        def replace(self, x):
            x = re.sub(self.removeImg, "", x)
            x = re.sub(self.removeAddr, "", x)
            x = re.sub(self.replaceLine, "\n", x)
            x = re.sub(self.replaceTD, "\t", x)
            x = re.sub(self.replacePara, "\n", x)
            x = re.sub(self.replaceBR, "\n", x)
            x = re.sub(self.removeExtraTag, "", x)
            return x.strip()

    class BDTB:
        def __init__(self, baseUrl, seeLZ):
            self.baseURL = baseUrl
            self.seeLZ = '?see_lz=' + str(seeLZ)
            self.tool = Tool()

        def getPage(self, pageNum):
            try:
                url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
                request = urllib.request.Request(url)
                response = urllib.request.urlopen(request).read().decode("utf8")
                #print(response)
                return response
            except URLError as e:
                if hasattr(e, "reason"):
                    print("Failed to connect to Baidu Tieba, reason:", e.reason)
                    return None

        def getTitle(self):
            page = self.getPage(1)
            pattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
            result = re.search(pattern, page)
            if result:
                #print(result.group(1))
                return result.group(1).strip()
            else:
                return None

        def getPageNum(self):
            page = self.getPage(1)
            pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
            result = re.search(pattern, page)
            #print(result.group(1))
            if result:
                return result.group(1).strip()
            else:
                return None

        def getContent(self, page):
            pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
            items = re.findall(pattern, page)
            floor = 1
            for item in items:
                # "楼" = floor (post number); end='' avoids an extra newline
                print(floor, "楼-------------------------------------------------------------------------------------\n", end='')
                print(self.tool.replace(item))
                floor += 1


    baseURLh = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURLh, 1)
    bdtb.getContent(bdtb.getPage(1))
    
    
    
    
    Output:

    C:\Python34\python.exe C:/Users/Administrator/ceshi.py
    1 楼-------------------------------------------------------------------------------------
    很多媒体都在每赛季之前给球员排个名,我也有这个癖好…………,我会尽量理性的分析球队地位,个人能力等因素,评出我心目中的下赛季50大现役球员,这个50大是指预估他本赛季在篮球场上对球队的影响力……不是过去的荣誉什么的,所以难免有一定的主观性……如果把你喜欢的球星排低了,欢迎理性讨论!
    
    状元维金斯镇楼
    P.S 1 我每天都至少更新一个,不TJ。
          2 今年的新秀我就不考虑了,没上赛季参照
    2 楼-------------------------------------------------------------------------------------
    50 惊喜新人王 迈卡威
    上赛季数据
    篮板 6.2  助攻 6.3  抢断 1.9 盖帽  0.6 失误 3.5 犯规  3  得分 16.7
    
    新赛季第50位,我给上赛季的新人王迈卡威。 上赛季迈卡威在彻底重建的76人中迅速掌握了球队,一开始就三双搞定了热火赢得了万千眼球。后来也屡屡有经验的表现,新秀赛季就拿过三双的球员不多,迈卡威现在可以说在76人站稳了脚跟。
    
    
    
    
    7. XPath syntax for Python crawlers
    
    
    http://www.cnblogs.com/lonenysky/p/4649455.html
    
    
    //*[@id="AD_4586850"]/div[1]/strong/i
    
    //*[@id="shop_list"]/div[1]/strong/i
    //*[@id="shop_list"]
    
    
    
    
    8. Using requests
    
    
    http://cuiqingcai.com/2556.html
    
    # -*- coding: utf-8 -*-
    import requests

    # headers: a dict holding the User-Agent (commented out, not sent below)
    #headers = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
    html = requests.get('http://cuiqingcai.com')
    print(html.text)
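
    To actually send the commented-out headers and inspect the response, a sketch (the header value is abbreviated):

    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get('http://cuiqingcai.com', headers=headers, timeout=10)
    print(response.status_code)   # HTTP status code
    print(response.encoding)      # encoding requests inferred from the headers
    print(response.text[:200])    # first 200 characters of the decoded body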
    
    
    
    9. Using re.sub
    
    
    http://blog.csdn.net/lovemianmian/article/details/8867613
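
    All the examples below share one call shape; as a reminder:

    import re

    # re.sub(pattern, replacement, text) returns a new string in which
    # every non-overlapping match of pattern is replaced
    print(re.sub(r'\d+', '#', 'a1b22c333'))   # -> a#b#c#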
    
    
    1. Remove the <img> tag
    
    import re
    text ='<imgJGood is a handsome boy,>         he is cool, clever, and so on...'
    removeImg = re.compile('<img.*?>')
    s=re.sub(removeImg,"",text).strip()
    print (s)
    
    
    C:\Python34\python.exe G:/xiaoshuo2.py
    he is cool, clever, and so on...
    
    
    
    
    
    1.1 Remove only the run of 7 spaces
    
    import re
    text ='<imgJGood is a handsome boy,>         he is cool, clever, and so on...'
    removeImg = re.compile('| {7}|')
    s=re.sub(removeImg,"",text).strip()
    print (s)
    
    Output:

    C:\Python34\python.exe G:/xiaoshuo2.py
    <imgJGood is a handsome boy,>         he is cool, clever, and so on...

    Note: the pattern '| {7}|' begins with an empty alternative, and the empty string matches first at every position, so the 7-space run is never consumed; the text is printed unchanged.
    
    
    
    
    2. Remove the <img> tag and the 7-space run
    
    
    import re
    text ='<imgJGood is a handsome boy,>         he is cool, clever, and so on...'
    removeImg = re.compile('<img.*?>| {7}|')
    s=re.sub(removeImg,"",text).strip()
    print (s)
    
    
    
    Output:


    C:\Python34\python.exe G:/xiaoshuo2.py
    he is cool, clever, and so on...
    
    Process finished with exit code 0
    
    
    
    
    3. Remove the <img> tag while keeping the 7 spaces
    
    
    import re
    text ='<imgJGood is a handsome boy,>         he is cool, clever, and so on...'
    
    
    removeImg = re.compile('<img.*?>{7}')
    s=re.sub(removeImg,"",text).strip()
    print (s)
    
    
    Output:


    C:\Python34\python.exe G:/xiaoshuo2.py
    <imgJGood is a handsome boy,>         he is cool, clever, and so on...

    Process finished with exit code 0

    Note: the {7} quantifier binds to the single '>' before it, so '<img.*?>{7}' only matches seven consecutive '>' characters. Nothing in the text matches, and the string is printed unchanged.
    
    
    
    4. Strip a pair of tags, keeping the text between them
    
    
    
    import re
    text='<a href="http://jump2.bdimg.com/safecheck/index?url=x+Z5)">迈卡威</a>刷出了不错的数据'
    removeImg = re.compile('<a.*?>|</a>')
    s=re.sub(removeImg,"",text).strip()
    print (s)
    
    
    Output:


    C:\Python34\python.exe G:/xiaoshuo2.py
    迈卡威刷出了不错的数据
    
    
    
    
    5. Replace <br> tags with \n newlines
    
    
    
    import re
    text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个,不TJ。<br>      2 今年的新秀我就不考虑了,没上赛季参照'
    removeImg = re.compile('<br><br>|<br>')
    s = re.sub(removeImg, "\n", text).strip()
    print(s)
    
    
    
    C:\Python34\python.exe G:/xiaoshuo2.py
    height="510">
    
    状元维金斯镇楼
    P.S 1 我每天都至少更新一个,不TJ。
          2 今年的新秀我就不考虑了,没上赛季参照
    
    
    
    
    5.1 Replace every single <br> with \n (each <br> becomes its own newline)
    
    
    import re
    text = 'height="510"><br><br><br><br>状元维金斯镇楼<br>P.S 1 我每天都至少更新一个,不TJ。<br>      2 今年的新秀我就不考虑了,没上赛季参照'
    removeImg = re.compile('<br>')
    s = re.sub(removeImg, "\n", text).strip()
    print(s)
    
    
    
    C:\Python34\python.exe G:/xiaoshuo2.py
    height="510">
    
    
    
    状元维金斯镇楼
    P.S 1 我每天都至少更新一个,不TJ。
          2 今年的新秀我就不考虑了,没上赛季参照
    
    
    
    10. Regular expressions
    
    
    <div class="list-item">
                <div class="personal-info">
                    <div class="pic-word">
                        <div class="pic s60">
                            <a href="//mm.taobao.com/687471686.htm" target="_blank" class="lady-avatar">        <img src="//gtd.alicdn.com/sns_logo/i2/TB1XZ1PQVXXXXaJXpXXSutbFXXX.jpg_60x60.jpg" alt="" width="60" height="60"/>
    </a>
                        </div>
                        <p class="top">
                        <a class="lady-name" href="//mm.taobao.com/self/model_card.htm?user_id=687471686" target="_blank">jenkins</a>
                        <em><strong>27</strong>岁</em>
                        <span>广州市</span>
    
    pattern = re.compile('<div class="list-item">.*? href="(.*?)".*? src="(.*?)".*? target="_blank">(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)
    Or:
    
    pattern = re.compile('<div class="list-item">.*?<a href="(.*?)".*?<img src="(.*?)".*?<a class="lady-name".*?>(.*?)</a>.*?<strong>(.*?)</strong>.*?<span>(.*?)</span>',re.S)
    
    
    
  • Original article: https://www.cnblogs.com/effortsing/p/10049597.html