  • Scraping bilingual news from Chinadaily

    Today I had an ad-hoc need to scrape some bilingual material (not cleaned yet) and wanted to make full use of it.

    The code below is meant to collect the link to each bilingual news article on Chinadaily. The first step is to study the site's URLs and page structure; pagination generally appends _2, _3, and so on to the front-page URL. So the following script only gathers the links.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    """
    File: bi_news.py
    Author: ZhangHaiou(hozhangel@126.com)
    Date: 2018/05/04
    """
    
    import urllib
    import re
    import os
    
    bi_urls = []
    def getHtml(url):    # fetch a page and return it as a list of lines
        page = urllib.urlopen(url)
        html = page.readlines()
        #print html
        return html
    
    def getImg(html):    # (unused helper) download .jpg images matched in the page
        reg = r'src="(.+?\.jpg)" pic_ext'
        imgre = re.compile(reg)
        imglist = re.findall(imgre,html)
        x = 0
        for imgurl in imglist:
            urllib.urlretrieve(imgurl,'%s.jpg' % x)
            x+=1
        
    def geturl(html):   # extract the article links we need from the page
        for line in html:
            if re.search(r'<div class="mr10"><a href="\d{4}-\d{2}/\d{2}/content_\d{4,}\.htm"', line):
                if re.search(r'<div class="mr10"><a href="2016-\d{2}/\d{2}/content_\d{4,}\.htm"', line):    # I only want corpus from after 2016, so stop at the first 2016 article
                    os._exit(0)
                else:
                    url = re.findall(r'\d{4}-\d{2}/\d{2}/content_\d{4,}\.htm', line)
                    print("http://language.chinadaily.com.cn/" + url[0])
                    bi_urls.append("http://language.chinadaily.com.cn/" + url[0])
    
                    
    if __name__ == '__main__':        
        n = 1
        # os.system('wget -r --spider http://language.chinadaily.com.cn/news_bilingual.html')
        # #geturl(getHtml("http://language.chinadaily.com.cn/news_bilingual.html"))
        while n:    # keep paging until geturl() hits a 2016 article and exits
            if(n < 2):
                html = getHtml("http://language.chinadaily.com.cn/news_bilingual.html")
            elif(n > 1):
                html = getHtml("http://language.chinadaily.com.cn/news_bilingual_" + str(n) + ".html")
            geturl(html)
            n = n + 1
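
    A quick way to sanity-check the link pattern used above; the sample HTML line and article number below are made up for illustration:

    import re
    
    # a made-up list-page line in the shape the scraper expects
    sample = '<div class="mr10"><a href="2018-05/04/content_36141392.htm" target="_blank">...</a></div>'
    print(re.findall(r'\d{4}-\d{2}/\d{2}/content_\d{4,}\.htm', sample))
    # prints: ['2018-05/04/content_36141392.htm']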

    Run python bi_news.py > url.txt to save the URLs we want.

    Contents of url.txt:
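
    (The original post pasted the file contents here; based on the script above, each line is a full article URL of this shape, with the dates and article numbers below made up for illustration:)

    http://language.chinadaily.com.cn/2018-05/04/content_36141392.htm
    http://language.chinadaily.com.cn/2018-05/03/content_36138270.htm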

    The next step is a simple crawl of the page behind each URL line in url.txt, filing the news into per-month folders; each file is named after the eight-digit number at the end of its news link.

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    
    """
    File: content.py
    Author: ZhangHaiou(hozhangel@126.com)
    Date: 2018/05/04
    """
    
    import urllib
    import re
    import os
    import sys
    bi_urls = []
    def getHtml(url):    # fetch a page and return it as one string
        page = urllib.urlopen(url)
        html = page.read()
        #print html
        return html
    
    def getImg(html):    # (unused helper, carried over from bi_news.py)
        reg = r'src="(.+?\.jpg)" pic_ext'
        imgre = re.compile(reg)
        imglist = re.findall(imgre,html)
        x = 0
        for imgurl in imglist:
            urllib.urlretrieve(imgurl,'%s.jpg' % x)
            x+=1
        
    def geturl(html):    # (unused here; leftover from bi_news.py)
        for line in html:
            if re.search(r'<div class="mr10"><a href="\d{4}-\d{2}/\d{2}/content_\d{4,}\.htm"', line):
                if re.search(r'<div class="mr10"><a href="2016-\d{2}/\d{2}/content_\d{4,}\.htm"', line):
                    os._exit(0)
                else:
                    url = re.findall(r'\d{4}-\d{2}/\d{2}/content_\d{4,}\.htm', line)
                    print(url)
                    bi_urls.append(url)
    def savefile(savepath, content):    # write the raw page content to savepath
        with open(savepath, "w") as fp:
            fp.write(content)
                    
    if __name__ == '__main__':        
    
        for line in open(sys.argv[1], 'r'):
            line = line.strip()    # drop the trailing newline read from url.txt
            content = ""
            n = 1
            while n:    # this loop makes sure paginated articles are not missed
                if n > 1:
                    # later pages follow the same pattern as the list pages:
                    # the _n suffix goes before the .htm extension
                    htm = line.replace('.htm', '_' + str(n) + '.htm')
                else:
                    htm = line
                raw = getHtml(htm)
                
                if not re.findall(r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', raw):    # stop at the first blank/missing page
                    break
                print(htm)
                n = n + 1
                # for hang in raw:
                    # if re.search('^<p>.*</p>',hang):
                content = content + raw
            date = re.findall(r'\d{4}-\d{2}', line)[0]
            filename = re.findall(r'\d{6,}', line)[0]
            if not os.path.exists(date):  # create the month directory on first use
                os.makedirs(date)
            savefile(date + "/" + filename + ".txt", content)
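
    Run python content.py url.txt to crawl and file everything. Both scripts use the Python 2 urllib API; urllib.urlopen no longer exists in Python 3. If you need to run them under Python 3, a minimal sketch of the fetch helper, assuming the pages are UTF-8, would be:

    # Python 3 replacement for getHtml(); a sketch, not part of the original post
    import urllib.request
    
    def getHtml(url):
        with urllib.request.urlopen(url) as page:
            # decode assuming UTF-8; swap in the page's real charset if it differs
            return page.read().decode('utf-8', errors='replace')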
            
          
  • Original post: https://www.cnblogs.com/hozhangel/p/8990818.html