zoukankan      html  css  js  c++  java
  • 爬取大半导体网新闻内容保存到 Word(基于 Python 3.6)

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    # @File : Spider
    # @Author : moucong
    # @Date : 2018/12/25 16:36
    # @Software: PyCharm
    from urllib import request
    from bs4 import BeautifulSoup
    from urllib.parse import quote
    from docx.shared import Inches
    from docx.oxml.ns import qn
    import string
    import time
    import re
    import docx
    import os




    def spider(url="http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
               main_url="http://www.semi.org.cn",
               pic_dir="E:/SEMI_job/SEMI_Spider/pic/",
               output_path="E:/SEMI_job/SEMI_Spider/writeResult.docx"):
        """Download one news page and save its text and images to a Word file.

        The page is fetched from ``url`` and decoded as UTF-8. The first
        ``<p>...</p>`` (or ``<img ...>``) match is split on ``<br />``; each
        fragment then becomes one of:

        * a picture, when it contains an ``img`` tag with a ``src="..."``
          attribute (the image is downloaded into ``pic_dir`` first);
        * a bold paragraph, when it contains ``<strong>`` markup;
        * a plain paragraph otherwise (with ``\\r``, ``\\n``, ``\\t`` removed).

        Args:
            url: Address of the news page to scrape.
            main_url: Prefix prepended to every image ``src`` value.
            pic_dir: Directory that receives the downloaded images; created
                on demand.
            output_path: Where the resulting ``.docx`` file is written.

        Raises:
            IndexError: If the page contains no ``<p>`` or ``<img>`` match.
        """
        # Close the HTTP response deterministically instead of leaking it.
        with request.urlopen(url) as response:
            page = response.read().decode('utf-8')

        patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)  # find <img> and <p> tags
        matches = patt.findall(page)
        if not matches:
            raise IndexError("no <p> or <img> content found at %s" % url)

        # With two capture groups findall() yields tuples; use the group that
        # actually matched rather than the repr of the whole tuple, which
        # would leak quotes, parentheses and escaped "\r\n" into the document.
        first_match = matches[0]
        body_html = first_match[0] or first_match[1]
        content_list = body_html.split("<br />")

        document = docx.Document()
        # Style the document font once: Latin and East-Asian face both 宋体.
        document.styles['Normal'].font.name = u'宋体'
        document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

        src_patt = re.compile('src="(.*?)"')
        strong_patt = re.compile('<strong>(.*?)</strong>')
        # The counter lives outside the loop so each image gets its own file
        # name instead of every download overwriting "0.jpg".
        pic_index = 0

        for fragment in content_list:
            sources = src_patt.findall(fragment) if "img" in fragment else []

            if sources:
                os.makedirs(pic_dir, exist_ok=True)
                # string.printable contains the space character, so quote()
                # below leaves spaces alone; encode them explicitly first.
                picurl = (main_url + sources[0]).replace(' ', '%20')
                # Percent-encode only non-ASCII characters (e.g. Chinese file
                # names) while keeping the URL's own punctuation intact.
                picurl = quote(picurl, safe=string.printable)
                pic_path = os.path.join(pic_dir, "%s.jpg" % pic_index)
                request.urlretrieve(picurl, pic_path)
                pic_index += 1
                document.add_picture(pic_path, width=Inches(3.0))

            elif "strong" in fragment:
                # findall() returns a list; add_run() needs a single string.
                bold_text = "".join(strong_patt.findall(fragment))
                run = document.add_paragraph().add_run(bold_text)
                run.font.bold = True  # bold

            else:
                content_part = fragment.replace('\r', '').replace('\n', '').replace('\t', '')
                document.add_paragraph(content_part)

        # NOTE(review): the original literal was "E:SEMI_jobSEMI_SpiderwriteResult.docx",
        # which looks like a path whose separators were lost; the default
        # output_path is the presumed intent -- confirm the exact location.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        document.save(output_path)
        print("已处理好!")
  • 相关阅读:
    Python异常详解:基类、具体异常、异常层次结构
    Python视频教程,百度云资源,免费分享
    Python学习路线图(内附14张思维导图)
    Python视频教程免费下载,最新Python免费教程视频分享!
    怎样通过互联网ssh访问家里电脑
    linux下,把屏幕竖起来
    python中函数的不定长参数
    python中全局变量和局部变量
    vbox+Vagrant 入门指南
    python中函数返回多个值
  • 原文地址:https://www.cnblogs.com/setname/p/10195397.html
Copyright © 2011-2022 走看看