zoukankan      html  css  js  c++  java
  • 爬取大半导体网新闻内容并保存到 Word(基于 Python 3.6)

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    # @File : Spider
    # @Author : moucong
    # @Date : 2018/12/25 16:36
    # @Software: PyCharm
    from urllib import request
    from bs4 import BeautifulSoup
    from urllib.parse import quote
    from docx.shared import Inches
    from docx.oxml.ns import qn
    import string
    import time
    import re
    import docx
    import os




    def spider(
        url: str = "http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
        pic_dir: str = "E:/SEMI_job/SEMI_Spider/pic/",
        output_path: str = "E:/SEMI_job/SEMI_Spider/writeResult.docx",
    ) -> None:
        """Download one news page and save its first ``<p>`` block as a Word file.

        The content of the first ``<p>...</p>`` element of the page is split on
        ``<br />``. Each resulting segment is written to the document as:

        * a picture, when the segment contains an ``<img>`` with a ``src="..."``
          attribute (the image is downloaded into ``pic_dir`` first);
        * a bold paragraph holding the text inside ``<strong>...</strong>``,
          when the segment contains ``strong``;
        * a plain paragraph (with ``\\r``, ``\\n`` and ``\\t`` removed) otherwise.

        Args:
            url: Address of the page to fetch; it is decoded as UTF-8.
            pic_dir: Directory that receives the downloaded images, named
                ``0.jpg``, ``1.jpg``, ... in order of appearance. Created on
                demand.
            output_path: Destination of the generated ``.docx`` file. Its parent
                directory is created if missing.

        Raises:
            IndexError: If the page contains no ``<p>...</p>`` block.
            urllib.error.URLError: If the page or an image cannot be fetched.
        """
        # NOTE(review): the default ``output_path`` was reconstructed from a
        # literal whose path separators were missing
        # ("E:SEMI_jobSEMI_SpiderwriteResult.docx") -- confirm the location.

        # Local import: ``urljoin`` is not among the file-level imports.
        from urllib.parse import urljoin

        # Close the HTTP response deterministically instead of leaking it.
        with request.urlopen(url) as response:
            page = response.read().decode('utf-8')

        # A single capture group keeps the match a plain string, so the text
        # is not polluted by a tuple repr or escaped control characters.
        paragraph = re.search(r'<p>(.*?)</p>', page, re.S)
        if paragraph is None:
            raise IndexError("no <p>...</p> block found in %s" % url)
        segments = paragraph.group(1).split("<br />")

        # Loop-invariant helpers are prepared once.
        src_pattern = re.compile(r'src="(.*?)"')
        strong_pattern = re.compile(r'<strong>(.*?)</strong>')
        strip_controls = str.maketrans('', '', '\r\n\t')

        document = docx.Document()
        # The 'Normal' style is document-wide, so set the font once up front.
        # The East Asian font must be set on the XML element as well for
        # Chinese text to pick it up.
        normal_style = document.styles['Normal']
        normal_style.font.name = '宋体'
        normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

        image_count = 0  # lives outside the loop so each image gets its own file
        for segment in segments:
            sources = src_pattern.findall(segment) if "img" in segment else []

            if sources:
                os.makedirs(pic_dir, exist_ok=True)
                # urljoin resolves site-relative and absolute ``src`` values.
                image_url = urljoin(url, sources[0])
                # Percent-encode non-ASCII characters; spaces are part of
                # string.printable and therefore need explicit encoding.
                image_url = quote(image_url, safe=string.printable)
                image_url = image_url.replace(' ', '%20')

                image_path = os.path.join(pic_dir, "%s.jpg" % image_count)
                request.urlretrieve(image_url, image_path)
                image_count += 1
                document.add_picture(image_path, width=Inches(3.0))

            elif "strong" in segment:
                # findall returns a list; add_run needs a single string.
                bold_text = ''.join(strong_pattern.findall(segment))
                run = document.add_paragraph().add_run(bold_text)
                run.font.bold = True

            else:
                document.add_paragraph(segment.translate(strip_controls))

        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        document.save(output_path)
        print("已处理好!")
  • 相关阅读:
    poj 1149 最大流
    poj 3281 最大流建图
    lightoj 1300 边双联通分量+交叉染色求奇圈
    lightoj 1291 无向图边双联通+缩点统计叶节点
    lightoj 1063 求割点
    lightoj 1026 无向图 求桥
    lightoj 1407 2-sat
    lightoj 1251 (Two_Sat)
    hdu 4681 最长公共子序列+枚举
    OD汇编需要标签
  • 原文地址:https://www.cnblogs.com/setname/p/10195397.html
Copyright © 2011-2022 走看看