zoukankan      html  css  js  c++  java
  • Python爬虫实战(三):爬网易新闻

    代码:

    # _*_ coding:utf-8 _*_
    import urllib2
    import re
    #import sys
    
    #reload(sys)
    #sys.setdefaultencoding('utf-8')
        
    class Tool:
        """Strip NetEase article HTML down to plain text.

        Each compiled pattern removes or rewrites one family of tags;
        ``replace`` applies them in a fixed order and returns the
        cleaned, stripped text.

        NOTE(review): the scraped source had lost every backslash, so the
        ``"\n"`` / ``"\t"`` replacement literals (and the ``<br>`` pattern)
        are restored here to their evidently intended form.
        """
        # Centered image paragraphs are dropped entirely.
        removeImg = re.compile(r'<p class="f_center".*?</p>')
        # Anchor tags: strip the markup, keep the link text.
        removeAddr = re.compile(r'<a.*?>|</a>')
        # Block-level closing/opening tags become line breaks.
        replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
        # Table cells become tab stops.
        replaceTD = re.compile(r'<td>')
        # Opening paragraph tags become newline + two-space indent.
        replacePara = re.compile(r'<p.*?>')
        # Line-break tags become newlines. The source read '<br<br>',
        # which is not valid HTML; '<br><br>' is the likely intent.
        replaceBR = re.compile(r'<br><br>|<br>')
        # Anything that still looks like a tag is removed last.
        removeExtraTag = re.compile(r'<.*?>')

        def replace(self, text):
            """Return *text* with HTML markup removed/normalised."""
            text = re.sub(self.removeImg, "", text)
            text = re.sub(self.removeAddr, "", text)
            text = re.sub(self.replaceLine, "\n", text)
            text = re.sub(self.replaceTD, "\t", text)
            text = re.sub(self.replacePara, "\n" + "  ", text)
            text = re.sub(self.replaceBR, "\n", text)
            text = re.sub(self.removeExtraTag, "", text)
            return text.strip()
            
    
    class WYXW:
        """Scraper for NetEase news (news.163.com).

        Collects article URLs from the homepage, then writes each
        article's title / timestamp / source / body to a text file.

        Written for Python 2 (uses the module-level ``urllib2`` import);
        ``print(...)`` call syntax is used so the class also parses on
        Python 3.

        NOTE(review): the scraped source had lost every backslash, so
        the ``\d`` / ``\w`` regex classes and the ``"\n"`` / ``"\t"``
        literals below are restored to their evidently intended form.
        """

        def __init__(self, baseUrl):
            self.baseURL = baseUrl
            # Spoof a browser user agent so the site serves normal pages.
            self.user_agent = 'Mozilla/4.0 (compatible;MSIE 5.5; Windows NT)'
            self.headers = {'User-Agent': self.user_agent}
            # Default output file name, used when writeData() gets None.
            self.fileName = u'网易新闻'
            self.tool = Tool()

        def get_homepage(self):
            """Fetch the homepage and return its HTML decoded as UTF-8.

            'ignore' skips the occasional malformed byte in the page.
            """
            request = urllib2.Request(self.baseURL, headers=self.headers)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8', 'ignore')

        def extract_url(self, homepage):
            """Return every article URL found in the homepage HTML.

            Article links look like
            http://news.163.com/YY/MMDD/HH/<16 word chars>.html
            """
            pattern = r"http://news.163.com/\d{2}/\d{4}/\d{2}/\w{16}\.html"
            return re.findall(pattern, homepage)

        def extract_sub_web_time(self, sub_web):
            """Return the first 'YYYY-MM-DD HH:MM:SS' timestamp in the page.

            Raises IndexError if no timestamp is present (as the
            original did); writeData() guards before calling this.
            """
            pattern = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', re.S)
            stamps = re.findall(pattern, sub_web)
            print(stamps[0])
            return stamps[0]

        def extract_sub_web_source(self, sub_web):
            """Return the article's source (the ne_article_source anchor text)."""
            pattern = re.compile(r'<a id="ne_article_source".*?>(.*?)</a>')
            sources = re.findall(pattern, sub_web)
            print(sources[0])
            return sources[0]

        def extract_sub_web_title(self, sub_web):
            """Return the article headline, or None when absent.

            Bug fix: re.findall returns a list, never None, so the
            original 'is not None' test could not prevent an IndexError
            on an empty result; test truthiness instead.
            """
            pattern = re.compile(r'<h1 id="h1title" class="ep-h1">(.*?)</h1>', re.S)
            titles = re.findall(pattern, sub_web)
            if titles:
                print(titles[0])
                return titles[0]
            return None

        def extract_sub_web_content(self, sub_web):
            """Return the raw HTML of the article body (endText div), or None.

            Same findall-truthiness fix as extract_sub_web_title.
            """
            pattern = re.compile(r'<div id="endText".*?>(.*?)<!.*?-->', re.S)
            contents = re.findall(pattern, sub_web)
            if contents:
                return contents[0]
            return None

        def writeData(self, fName):
            """Scrape every article linked from the homepage and write
            title/time/source/body to <name>.txt.

            fName: output base name, or None to use self.fileName.
            """
            name = fName if fName is not None else self.fileName
            # Don't shadow the 'file' builtin; close the handle even on
            # error (the original leaked it).
            out = open(name + '.txt', "w+")
            try:
                homepage = self.get_homepage()
                for url in self.extract_url(homepage):
                    print(url)
                    web = urllib2.urlopen(url).read()
                    title = self.extract_sub_web_title(web)
                    content = self.extract_sub_web_content(web)
                    if title is None or content is None:
                        # Page without the expected markup: skip instead
                        # of crashing on None.strip() as the original did.
                        continue
                    time = self.extract_sub_web_time(web).strip()
                    source = self.extract_sub_web_source(web).strip()
                    content = self.tool.replace(content)
                    news = title.strip() + "\n\n" + time + "\t" + source + "\n\n" + content + "\n"
                    out.write(news)
                    sep = "\n" + "-------------------------------------------------------------------------" + "\n"
                    out.write(sep)
                    print(u"新闻写入成功" + "\n")
            finally:
                out.close()
      
    
    # Script entry point: scrape the NetEase news homepage and write every
    # linked article to a text file (performs network and file I/O on import).
    baseUrl = "http://news.163.com"
    wyxw = WYXW(baseUrl)
    # None -> fall back to the default file name set in WYXW.__init__.
    wyxw.writeData(None)
  • 相关阅读:
    mabatis的批量新增sql 初级的 初级的 初级的
    (批量更新)对多个符合条件的id做更新操作
    js中的 !! 和 ! 的区别
    js中===和==的区别
    select下拉框使用完毕后,重置按钮使其清空
    select下拉框的数据回显
    字符串拼串 能缓解我们的开发难度→!←(ε=(´ο`*)))唉,又是一个不知道该怎么写题目的随笔啊,头疼)
    select下拉框可以直接取list里的内容 不用非得转map (不得不承认我是个ZZ,这么简单的问题才反应过来,--^--)
    sql中某条件不为空,可能有的小祖宗会喷了,这还用总结?emmm,我渣,我觉得有一点意思对于第二种(土味)
    左查询left join on简单总结
  • 原文地址:https://www.cnblogs.com/AndyJee/p/5003385.html
Copyright © 2011-2022 走看看