zoukankan      html  css  js  c++  java
  • 爬取大半导体网新闻内容保存到word(基于python3.6)

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    # @File : Spider
    # @Author : moucong
    # @Date : 2018/12/25 16:36
    # @Software: PyCharm
    from urllib import request
    from bs4 import BeautifulSoup
    from urllib.parse import quote
    from docx.shared import Inches
    from docx.oxml.ns import qn
    import string
    import time
    import re
    import docx
    import os




    def spider():
        """Scrape one SEMI news article and save its content to a Word file.

        The article page is downloaded, the body of its first non-empty
        ``<p>...</p>`` element is split on ``<br />``, and every resulting
        segment is appended to a new document:

        * a segment holding an ``<img ... src="...">`` tag has its picture
          downloaded into the picture directory and embedded 3 inches wide;
        * a segment holding ``<strong>...</strong>`` is written as bold text;
        * any other segment is written as a plain paragraph with CR, LF and
          TAB characters removed.

        The document is saved to ``E:/SEMI_job/SEMI_Spider/writeResult.docx``.

        Raises:
            IndexError: if the page contains no non-empty ``<p>`` element.
            urllib.error.URLError: if the page or a picture cannot be fetched.
        """
        url = "http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128"
        main_url = "http://www.semi.org.cn"
        pic_dir = "E:/SEMI_job/SEMI_Spider/pic/"
        result_path = "E:/SEMI_job/SEMI_Spider/writeResult.docx"

        # Close the HTTP response as soon as the body has been read.
        with request.urlopen(url) as response:
            page = response.read().decode('utf-8')

        # Look for <p> bodies and <img> tags.  With two capture groups,
        # findall() yields (paragraph, img) tuples; use the paragraph text
        # itself rather than the repr of the whole tuple.
        patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)
        paragraphs = [p_html for p_html, _img in patt.findall(page) if p_html]
        if not paragraphs:
            raise IndexError("no <p>...</p> content found at %s" % url)
        content_list = paragraphs[0].split("<br />")

        # Loop-invariant patterns are compiled once.
        pic_src = re.compile('src="(.*?)"')
        strong_font = re.compile('<strong>(.*?)</strong>')

        document = docx.Document()
        # Set the Latin and East Asian font of the default style once, up
        # front; font.name must be assigned first so the rPr element exists.
        document.styles['Normal'].font.name = u'宋体'
        document.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

        # Running index so each picture gets its own file name.
        pic_count = 0
        for segment in content_list:
            pic_urls = pic_src.findall(segment) if "img" in segment else []
            if pic_urls:
                os.makedirs(pic_dir, exist_ok=True)
                picurl = main_url + pic_urls[0]
                # quote() leaves spaces alone here because string.printable
                # contains the space character, so encode them explicitly.
                picurl = picurl.replace(' ', '%20')
                picurl = quote(picurl, safe=string.printable)
                pic_path = "%s%s.jpg" % (pic_dir, pic_count)
                request.urlretrieve(picurl, pic_path)
                pic_count += 1
                document.add_picture(pic_path, width=Inches(3.0))
            elif "strong" in segment:
                # findall() returns a list; add_run() needs a single string.
                strong_text = "".join(strong_font.findall(segment))
                run = document.add_paragraph().add_run(strong_text)
                # Bold
                run.font.bold = True
            else:
                content_part = segment.replace('\r', '').replace('\n', '').replace('\t', '')
                document.add_paragraph(content_part)

        # Make sure the target directory exists even when no picture was saved.
        os.makedirs(os.path.dirname(result_path), exist_ok=True)
        document.save(result_path)
        print("已处理好!")
  • 相关阅读:
    mysql
    MySQL主从同步
    python与各数据库的交互
    snmptrap
    web场景的监控
    zabbix的历史数据存储到elasticsearch中
    使用PopupWindow的实现步骤
    使用PopupWindow的实现步骤
    ListView及其ArrayAdapter的应用
    ListView及其ArrayAdapter的应用
  • 原文地址:https://www.cnblogs.com/setname/p/10195397.html
Copyright © 2011-2022 走看看