zoukankan      html  css  js  c++  java
  • python爬虫

    python爬虫相关

    • 1、爬取图片至本地,目前仅能实现将百度贴吧里面的图片下载至本地,百度图片里面的图片还不能实现,等完善
    • 2、抓取糗百的段子,查看作者、段子内容,点赞个数
    • 3、更加人性化的查看方式,按下回车显示一个段子,如果按[q|Q]就直接退出

    代码

    • 1、用到的模块,urllib,urllib2,re
    • 2、url匹配用到了re模块
    • 3、文件下载使用了urllib.urlretrieve()来将分析出来的图片下载至本地

    1、百度贴吧图片

    #!/usr/bin/env python
    #-*- coding:utf8 -*-
    
    import urllib, urllib2
    import re
    
    def getHtml(url):
    	'''Fetch *url* and return the raw HTML body as a byte string.'''
    	page = urllib2.urlopen(url)
    	try:
    		return page.read()
    	finally:
    		# Close the connection even if read() raises -- the original
    		# version leaked the socket until garbage collection.
    		page.close()
    
    def getImage(html):
    	'''需要注意这里的.*?表示的是非贪婪匹配,如果遇到第一个>,那么就停止匹配'''
    	re_img = re.compile(r'<img class="BDE_Image" src="(.*?)".*?>')
    	img_list = re_img.findall( html )
    	i = 1
    
    	for imgurl in img_list:
    		print imgurl
    		'''使用urllib.urlretrieve()来将分析出来的图片下载至本地
    		'''
    		urllib.urlretrieve(imgurl, filename='%s.jpg' % i)
    		i += 1
    
    if __name__ == '__main__':
    	# Alternative threads kept around for quick manual testing.
    	#url = 'http://tieba.baidu.com/p/3999261766'
    	#url = 'http://tieba.baidu.com/p/4957363500'
    	thread_url = 'http://tieba.baidu.com/p/2263349749'
    	getImage(getHtml(thread_url))
    
    

    2、一次性获取指定页面的糗百段子

    #!/usr/bin/env python
    #-*- coding:utf8 -*-
    
    '''
    # 爬取糗百段子
    1. 攫取段子
    2. 过滤带有图片的段子
    3. 实现第按一次回车显示一个段子的发布人,段子内容,点赞个数
    '''
    
    import urllib, urllib2
    import re
    
    page = 2
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    # print( url )
    # User-Agent: 封装
    headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}
    
    try:
    	req = urllib2.Request(url, headers=headers)
    	rsp = urllib2.urlopen(req)
    	html = rsp.read()
    except urllib2.URLError, e:
    	if hasattr(e, 'code'):
    		print e.code
    	if hasattr(e, 'reason'):
    		print e.reason
    
    re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?" alt="(.*?)"/>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(d+)</i>', re.S)
    items = re_page.findall( html )
    
    # print(items)
    for item in items:
    	for i in item:
    		print(i)
    

    3、实现每按一次回车显示一个段子的发布人,段子内容,点赞个数

    #!/usr/bin/env python
    #-*- coding:utf8 -*-
    
    '''
    # 爬取糗百段子
    1. 攫取段子
    2. 实现每按一次回车只显示一个段子
    '''
    
    import urllib, urllib2
    import re
    import sys
    
    def getPage(page_num):
       url = 'http://www.qiushibaike.com/hot/page/' + str(page_num)
       # print( url )
       # User-Agent: 封装
       headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0"}
    
       try:
          req = urllib2.Request(url, headers=headers)
          rsp = urllib2.urlopen(req)
          html = rsp.read()
          # print html
    
          return html # 返回网页源码
       except urllib2.URLError, e:
          if hasattr(e, 'code'):
             print('连接服务器失败, 错误代码: %s' % e.code)
             return None
          if hasattr(e, 'reason'):
             print('连接服务器失败,错误原因: %s' % e.reason)
             return None
    
    def getPageContent(page_num=1):
       '''Parse listing page *page_num* into (page, author, story, votes) tuples.
    
       Returns an empty list when the page could not be downloaded.
       '''
       html = getPage(page_num)
       if html is None:
          # getPage() already printed the error; nothing to parse.
          return []
    
       # group 1: author name (<h2>), group 2: story text (<span>),
       # group 3: up-vote count.  Fixed `(d+)` -> `(\d+)`: the original typo
       # required a literal run of "d" characters, so nothing ever matched.
       re_page = re.compile(r'<div class="author.*?>.*?<a.*?<img src=".*?"/>.*?</a>.*?<a.*?>.*?<h2>(.*?)</h2>.*?</a>.*?<a.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(\d+)</i>', re.S)
    
       page_contents = []
       for item in re_page.findall(html):
          page_contents.append((page_num, item[0].strip(), item[1].strip(), item[2].strip()))
    
       return page_contents
    
    def getOneStory(page_contents):
       '''Interactively print stories, one per Enter press; q/Q quits the program.
    
       *page_contents* holds (page_num, author, story, votes) tuples as built
       by getPageContent().
       '''
       for story in page_contents:
          keypress = raw_input()  # renamed from `input`, which shadowed the builtin
          if keypress.lower() == 'q':
             sys.exit()
          # Reconstructed format string: the published copy had a raw newline
          # inside the literal (a mangled \n escape), which is a syntax error.
          print('第%s页\t发布人:%s\t赞: %s\n%s' % (story[0], story[1], story[3], story[2]))
    
    if '__main__' == __name__:
       # Reconstructed banner: the published copy had raw newlines inside the
       # string literal (mangled \n escapes), which is a syntax error.
       print("Loading web content from web site ...\n Press [q|Q] to exit, and press 'Enter' see next content: \n")
       num = 1
    
       # Page through the site forever; getOneStory() terminates the program
       # via sys.exit() when the user presses q/Q.
       while True:
          page_contents = getPageContent(num)
          getOneStory(page_contents)
          num += 1
    

    未完,待续……

    Yesterday is history.
    Tomorrow is a mystery.
    But today is a gift.
    That is why it's called the present.
    The old game: give a wolf a taste, then keep him hungry.
  • 相关阅读:
    存储过程3前台
    最简单Login程序
    存储过程前台2
    程序员 开发工具箱
    存储过程4前台
    存储过程 insert
    公司网络解决方案
    存储过程前台
    linux常用指令
    ReentrantLock源码解析3优先响应中断的lockInterruptibly
  • 原文地址:https://www.cnblogs.com/ZhangRuoXu/p/6367132.html
Copyright © 2011-2022 走看看