zoukankan      html  css  js  c++  java
  • 爬取大半导体网新闻内容保存到word(基于python3.6)

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    # @File : Spider
    # @Author : moucong
    # @Date : 2018/12/25 16:36
    # @Software: PyCharm
    from urllib import request
    from bs4 import BeautifulSoup
    from urllib.parse import quote
    from docx.shared import Inches
    from docx.oxml.ns import qn
    import string
    import time
    import re
    import docx
    import os




    def spider(url="http://www.semi.org.cn/news/news_show.aspx?ID=54725&classid=128",
               pic_dir="E:/SEMI_job/SEMI_Spider/pic/",
               output_path="E:/SEMI_job/SEMI_Spider/writeResult.docx"):
        """Download one news page and save its text and images to a Word file.

        The page is fetched and decoded as UTF-8, the first ``<p>...</p>``
        body is split on ``<br />``, and each segment is appended to a new
        document: segments containing an ``<img ... src="...">`` are
        downloaded and inserted as pictures, segments containing
        ``<strong>...</strong>`` become bold paragraphs, and everything else
        is added as a plain paragraph (remaining HTML markup is kept as-is).

        Args:
            url: Address of the news page to scrape.
            pic_dir: Directory where downloaded images are stored; created
                if it does not exist.
            output_path: Destination of the generated ``.docx`` file; its
                parent directory is created if it does not exist.

        Raises:
            IndexError: If the page contains no matching ``<p>``/``<img>``
                content.

        Network and file-system errors from ``urllib``/``docx`` propagate
        unchanged.
        """
        # Local import so this block does not depend on the file-level imports.
        from urllib.parse import urljoin

        with request.urlopen(url) as response:
            page = response.read().decode('utf-8')

        # Look for <p> and <img> tags.
        patt = re.compile(r'<p>(.*?)</p>|<img (src = ".*?")>', re.S)
        matches = patt.findall(page)
        if not matches:
            raise IndexError("no <p> or <img> content found in %s" % url)
        # findall() yields (p_body, img_attr) tuples; use the captured text
        # itself rather than the tuple's repr, which would embed quoting and
        # escaped control characters in the output.
        paragraph_html, img_attr = matches[0]
        content_list = (paragraph_html or img_attr).split("<br />")

        # Compiled once: these patterns are reused for every segment.
        img_src_patt = re.compile('src="(.*?)"')
        strong_patt = re.compile('<strong>(.*?)</strong>')

        document = docx.Document()
        # Set the document-wide font, including the East Asian font slot
        # that Word uses for Chinese text.
        normal_style = document.styles['Normal']
        normal_style.font.name = u'宋体'
        normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')

        # The counter lives outside the loop so every image gets its own file
        # name instead of overwriting "0.jpg".
        pic_index = 0
        for segment in content_list:
            src_match = img_src_patt.search(segment) if "img" in segment else None
            if src_match:
                os.makedirs(pic_dir, exist_ok=True)
                # Resolve against the page URL so both site-relative and
                # absolute image addresses work.
                picurl = urljoin(url, src_match.group(1))
                # string.printable contains the space character, so quote()
                # would leave spaces untouched; escape them explicitly.
                picurl = picurl.replace(' ', '%20')
                # Percent-encode non-ASCII characters (e.g. Chinese file names).
                picurl = quote(picurl, safe=string.printable)
                pic_path = os.path.join(pic_dir, "%d.jpg" % pic_index)
                request.urlretrieve(picurl, pic_path)
                pic_index += 1
                document.add_picture(pic_path, width=Inches(3.0))
                continue

            strong_texts = strong_patt.findall(segment) if "strong" in segment else []
            if strong_texts:
                # add_run() expects text, not the list returned by findall().
                run = document.add_paragraph().add_run(''.join(strong_texts))
                # Bold.
                run.font.bold = True
                continue

            content_part = segment.replace('\r', '').replace('\n', '').replace('\t', '')
            document.add_paragraph(content_part)

        # NOTE(review): the original literal was "E:SEMI_jobSEMI_SpiderwriteResult.docx",
        # which looks like a Windows path whose separators were lost; the
        # default output_path is a best guess -- confirm the intended location.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        document.save(output_path)
        print("已处理好!")
  • 相关阅读:
    怎么用代码弹回 UITableView 中左滑出来的删除按钮
    android 利用 aapt 解析 apk 得到应用名称 包名 版本号 权限等信息
    Missy
    html5 websocket + node.js 实现网页聊天室
    android 代码混淆示例
    android volley 发送 POST 请求
    android viewpager 拿到当前显示的 fragment 的实例
    android actionbar viewpager 实现类微信主界面布局
    (转)初学Git及简单搭建git服务器和客户端
    error: Cannot find OpenSSL's <evp.h> Mac
  • 原文地址:https://www.cnblogs.com/setname/p/10195397.html
Copyright © 2011-2022 走看看