zoukankan      html  css  js  c++  java
  • Python爬虫

    # Minimal example: issue an HTTP GET through the third-party `requests`
    # package and dump the response body.
    from requests import request
    resp = request('get', 'http://www.baidu.com')
    # Python 2 print statement: writes `resp.content` to stdout.
    print resp.content

    Python爬虫遇到IOError或连接失败等问题时,可尝试将请求的headers补充完整。

     爬取贴吧图片

    import urllib2
    import urllib
    import re
    import time

    def gethtml(url):
        """Download *url* and return the raw response body.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            The response body exactly as returned by ``read()``.

        Raises:
            urllib2.URLError: Propagated unchanged when the request fails.
        """
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            return response.read()
        finally:
            # Release the connection even if read() raises.
            response.close()

    def imgget(html, path_template=r'E:MyProcodespidersTestimage\%s.jpg'):
        """Download every ``.jpg`` image referenced in *html*.

        Args:
            html: Page source to scan for ``src="....jpg" size=`` attributes.
            path_template: Destination path containing exactly one ``%s``
                placeholder, which receives a millisecond timestamp.  The
                default is the path used by the original script.
                NOTE(review): the default looks like a Windows path whose
                separators were lost -- confirm the intended directory.

        Returns:
            None.  Images are written to disk as a side effect.
        """
        # The dot before "jpg" is escaped so only a literal ".jpg" extension
        # matches (an unescaped dot would accept any character there).
        img_pattern = re.compile(r'src="(.+?\.jpg)" size=')
        for imgurl in img_pattern.findall(html):
            # Millisecond timestamp used as the file name.
            stamp = int(time.time() * 1000)
            urllib.urlretrieve(imgurl, path_template % stamp)
    # Walk pages 1-9 of the thread, pausing one second before each request,
    # and download the images found on every page.
    for i in range(1, 10):
        time.sleep(1)
        print('start catch page %d' % i)
        html = gethtml("https://tieba.baidu.com/p/4844779320?pn=%d" % i)
        imgget(html)

     Python爬取的网页read()一次之后,内容已被读完,再次read()会出问题;可将状态恢复至read()前的状态后再读取。

        def getpage(self, pn):
            """Fetch page *pn* of the thread and return its raw HTML.

            Args:
                pn: Page number; appended to the URL as ``&pn=<pn>``.

            Returns:
                The response body, or None when the request fails with
                ``urllib2.URLError``.
            """
            url = self.baseurl + self.lzonly + '&pn=' + str(pn)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    # The body can be consumed only once: a debugging
                    # ``response.read()`` placed before this line would leave
                    # nothing to return.
                    return response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                if hasattr(e, "reason"):
                    # %-formatting, because ``reason`` is not guaranteed to be
                    # a string and ``+`` would then raise TypeError.
                    print(u'连接错误,原因:%s' % e.reason)
                return None

     爬取贴吧帖子

    # -*- coding: utf-8 -*-
    __author__ = 'P00113'
    
    import urllib2
    import urllib
    import re
    import time
    
    
    # Helper that turns a post's HTML fragment into plain text.
    class Tool:
        """Strip or replace HTML tags so a post body reads as plain text."""

        # Remove <img> tags and runs of seven spaces.
        removeImg = re.compile(r'<img.*?>| {7}')
        # Remove hyperlink tags (the link text itself is kept).
        removeAddr = re.compile(r'<a.*?>|</a>')
        # Tags that end a line; replaced with "\n".
        replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
        # Table cell <td>; replaced with "\t".
        replaceTD = re.compile(r'<td>')
        # Paragraph opening; replaced with "\n" plus a four-space indent.
        replacePara = re.compile(r'<p.*?>')
        # Single or double <br>; replaced with "\n".
        replaceBR = re.compile(r'<br><br>|<br>')
        # Any tag still left at the end is dropped.
        removeExtraTag = re.compile(r'<.*?>')

        def replace(self, x):
            """Return *x* with its HTML markup reduced to plain text.

            The substitutions run in a fixed order: the specific tags are
            handled first, and the catch-all ``removeExtraTag`` runs last so
            it only sees what the earlier rules left behind.

            Args:
                x: HTML fragment.

            Returns:
                The text with leading and trailing whitespace stripped.
            """
            x = self.removeImg.sub("", x)
            x = self.removeAddr.sub("", x)
            x = self.replaceLine.sub("\n", x)
            x = self.replaceTD.sub("\t", x)
            x = self.replacePara.sub("\n    ", x)
            x = self.replaceBR.sub("\n", x)
            x = self.removeExtraTag.sub("", x)
            # strip() drops the surplus whitespace at both ends.
            return x.strip()
    
    
    class NZTB(object):
        """Fetch and print the posts of one Baidu Tieba thread.

        Attributes:
            baseurl: Thread URL, without a query string.
            lzonly: Query-string prefix of the form ``?see_lz=<flag>``.
            tool: ``Tool`` instance used to reduce post HTML to plain text.
        """

        # Compiled once at class creation instead of on every call.
        _TITLE_PAT = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        _REPLY_NUM_PAT = re.compile(
            r'<li class="l_reply_num.*?<span.*?>(.*?)</span>', re.S)
        # Body of each post.
        _CONTENT_PAT = re.compile(r'<div id="post_content_.*?">(.*?)</div>', re.S)
        # Floor number of each post: the digits right before "楼".
        _FLOOR_PAT = re.compile(
            r'<div class="post-tail-wrap"><span.*?>(\d*)楼', re.S)

        def __init__(self, baseurl, lzonly):
            """Store the thread URL and the ``see_lz`` flag.

            Args:
                baseurl: Thread URL, without a query string.
                lzonly: Value for the ``see_lz`` query parameter (presumably
                    1 restricts the thread to its starter's posts -- confirm).
            """
            self.baseurl = baseurl
            self.lzonly = '?see_lz=' + str(lzonly)
            self.tool = Tool()

        def getpage(self, pn):
            """Fetch page *pn* of the thread and return its raw HTML.

            Args:
                pn: Page number; appended to the URL as ``&pn=<pn>``.

            Returns:
                The response body, or None when the request fails with
                ``urllib2.URLError``.
            """
            url = self.baseurl + self.lzonly + '&pn=' + str(pn)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    # The body can be consumed only once, so it is read
                    # exactly here and nowhere else.
                    return response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                if hasattr(e, "reason"):
                    # %-formatting, because ``reason`` is not guaranteed to be
                    # a string and ``+`` would then raise TypeError.
                    print(u'连接错误,原因:%s' % e.reason)
                return None

        def gettitle(self):
            """Print and return the thread title found on page 1.

            Returns:
                The title with surrounding whitespace stripped, or None when
                the page could not be fetched or holds no title element.
            """
            html = self.getpage(1)
            # A failed download is treated like a page without a title.
            res = self._TITLE_PAT.search(html) if html is not None else None
            if res:
                print(res.group(1))
                return res.group(1).strip()
            print('%s' % ('-' * 100))
            return None

        def getpn(self):
            """Print and return the number shown in the reply-count item.

            Returns:
                The text of the first ``<span>`` inside the ``l_reply_num``
                list item of page 1, or None when the page could not be
                fetched or the item is missing.
                NOTE(review): the method name suggests a page count; confirm
                that the first span is the number wanted.
            """
            html = self.getpage(1)
            res = self._REPLY_NUM_PAT.search(html) if html is not None else None
            if res:
                print(res.group(1))
                return res.group(1)
            print('*****')
            return None

        def getcontent(self, pn):
            """Print every non-empty post on page *pn* with its floor number.

            Args:
                pn: Page number to fetch.

            Returns:
                None.  Nothing is printed when the page cannot be fetched.
            """
            html = self.getpage(pn)
            if html is None:
                return None
            bodies = self._CONTENT_PAT.findall(html)
            floors = self._FLOOR_PAT.findall(html)
            # Bodies and floor numbers are paired by position.
            for body, floor in zip(bodies, floors):
                text = self.tool.replace(body)
                if not text:
                    continue
                print(u'%s 楼%s' % (floor, '-' * 100))
                # Trailing " \n" keeps a blank line after every post.
                print('%s \n' % text)
            return None
    
    
    if __name__ == '__main__':
        # Print pages 1-3 of the sample thread, requested with see_lz=0.
        spider = NZTB('http://tieba.baidu.com/p/5058456989', 0)
        for page in range(1, 4):
            spider.getcontent(page)

     Python连接数据库时出现  UnicodeEncodeError: 'latin-1' codec can't encode character

    如下加入几行代码解决

    import MySQLdb
    
    
    # Connection settings, unpacked as keyword arguments for MySQLdb.connect().
    # NOTE(review): host and credentials are hard-coded; consider loading them
    # from configuration instead of keeping a password in source.
    db_para = {'host': '10.10.12.171',
               'port': 3306,
               'user': 'root',
               'passwd': 'Hwroot@com',
               'db': 'test'}
    dbcon = MySQLdb.connect(**db_para)
    cur = dbcon.cursor()
    # The four statements below are the workaround this note gives for
    # "UnicodeEncodeError: 'latin-1' codec can't encode character": they
    # select utf8 on the connection object and then for the session via SQL.
    dbcon.set_character_set('utf8')
    cur.execute('SET NAMES utf8;')
    cur.execute('SET CHARACTER SET utf8;')
    cur.execute('SET character_set_connection=utf8;')
  • 相关阅读:
    堆、栈、值类型、引用类型分析总结 Part 2
    DataGridView打印
    学习使用ArrayList
    C#与Java之比较
    【原创】串口通信测试程序
    彩色校验码的制作
    C#中使用进度条
    【原创】 Ajax之ModalPopup编程实例
    常用正则表达式
    堆、栈、值类型、引用类型分析总结 Part 1
  • 原文地址:https://www.cnblogs.com/cmm2016/p/6703179.html
Copyright © 2011-2022 走看看