zoukankan      html  css  js  c++  java
  • 爬取大半导体网新闻内容保存到word(基于python3.6)

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    # @File : Spider
    # @Author : moucong
    # @Date : 2018/12/25 16:36
    # @Software: PyCharm
    from urllib import request
    from bs4 import BeautifulSoup
    from urllib.parse import quote
    from docx.shared import Inches
    from docx.oxml.ns import qn
    import string
    import time
    import re
    import docx
    import os




    def spider():
        """Scrape one SEMI news article and save its content as a Word document.

        Fetches the hard-coded article URL, takes the first ``<p>``/``<img>``
        regex match in the page and splits it on ``<br />``.  Each fragment is
        then written to the document: a fragment with an image ``src`` is
        downloaded and embedded, a fragment containing ``strong`` becomes a
        bold paragraph, and anything else becomes a plain paragraph (with
        CR/LF/TAB characters stripped).

        Raises:
            IndexError: if the page contains no ``<p>`` or ``<img>`` match.
        """
        url = "http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128"
        main_url = "http://www.semi.org.cn"
        pic_dir = "E:/SEMI_job/SEMI_Spider/pic/"

        page = request.urlopen(url).read().decode('utf-8')

        # Find <p> and <img> tags.
        patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)
        matches = patt.findall(page)
        if not matches:
            raise IndexError("no <p> or <img> content found in %s" % url)
        # The pattern has two groups, so findall() yields tuples.  Use the
        # group that actually matched instead of str() of the tuple, which
        # would leak "('...', '')" and escaped newlines into the document.
        article_html = matches[0][0] or matches[0][1]
        content_list = article_html.split("<br />")

        # Compiled once here rather than on every loop iteration.
        src_pattern = re.compile('src="(.*?)"')
        strong_pattern = re.compile('<strong>(.*?)</strong>')

        document = docx.Document()
        # Font style used in the Word file; the w:eastAsia attribute is set
        # as well as the plain font name.
        document.styles['Normal'].font.name = u'宋体'
        document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

        # Counter lives outside the loop so each picture gets its own file
        # name instead of every download overwriting 0.jpg.
        pic_index = 0
        for fragment in content_list:
            src_urls = src_pattern.findall(fragment) if "img" in fragment else []
            if src_urls:
                os.makedirs(pic_dir, exist_ok=True)
                # Encode spaces explicitly: quote() below leaves them alone
                # because string.printable (its safe set) includes the space.
                picurl = (main_url + src_urls[0]).replace(' ', '%20')
                picurl = quote(picurl, safe=string.printable)
                pic_path = "%s%s.jpg" % (pic_dir, pic_index)
                request.urlretrieve(picurl, pic_path)
                pic_index += 1
                document.add_picture(pic_path, width=Inches(3.0))
            elif "strong" in fragment:
                # findall() returns a list; add_run() takes a single string.
                bold_text = "".join(strong_pattern.findall(fragment))
                run = document.add_paragraph().add_run(bold_text)
                # Bold.
                run.font.bold = True
            else:
                content_part = fragment.replace('\r', '').replace('\n', '').replace('\t', '')
                document.add_paragraph(content_part)

        # NOTE(review): this path contains no separators; presumably the
        # backslashes of something like E:\SEMI_job\SEMI_Spider\... were lost.
        # Kept as-is -- confirm the intended output location.
        document.save("E:SEMI_jobSEMI_SpiderwriteResult.docx")
        print("已处理好!")
  • 相关阅读:
    Cordova-conifg.xml配置
    Cordova插件开发
    android shape的使用
    Cordova
    性能优化
    ionic默认样式android和ios差异
    在IIS中部署ASP.NET 5应用程序遭遇的问题
    Ionic命令大全
    IOS开发
    Cordova 8 架构使用sqlite
  • 原文地址:https://www.cnblogs.com/setname/p/10195397.html
Copyright © 2011-2022 走看看