zoukankan      html  css  js  c++  java
  • python批量获取公众号图片生成PDF文件

    一、获取公众号图片

    需要安装的包
    1、pip install bs4
    2、pip install requests
     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # Author: KaiSun
     4 
     5 import requests
     6 from bs4 import BeautifulSoup
     7 import re
     8 import os
     9 
    10 #获取网页信息
    def getHTMLText(url):
        """Fetch *url* and return the response body as text.

        The request uses a 30 second timeout and non-2xx responses are treated
        as failures. The text is decoded with the encoding detected from the
        body (``apparent_encoding``) rather than the one declared in headers.

        Args:
            url: Address of the page to download.

        Returns:
            The decoded page text, or an empty string if the request failed.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            # Best-effort fetch: network/HTTP problems yield "" for the caller,
            # but KeyboardInterrupt and programming errors still propagate.
            return ""
        r.encoding = r.apparent_encoding
        return r.text
    19 
    20 #解析网页,获取所有图片url
    def getimgURL(html):
        """Parse *html* and collect the URL of every ``<img>`` tag.

        The URL is read from the parsed tag attributes instead of running a
        regular expression over the serialised tag: the previous pattern
        required a space after the closing quote and therefore missed tags
        whose source attribute came last. ``data-src`` is preferred over
        ``src`` because lazy-loading pages keep the real image address there.

        Args:
            html: HTML document as a string (may be empty).

        Returns:
            A list with one single-element list ``[url]`` per image that has a
            non-empty source; ``download`` reads element ``[0]`` of each entry.
        """
        soup = BeautifulSoup(html, "html.parser")
        adlist = []
        for img in soup.find_all("img"):
            src = img.get("data-src") or img.get("src")
            if src:
                # Keep the historical shape (list of one-element lists).
                adlist.append([src])
        return adlist
    32 
    33 #新建文件夹pic,下载并保存爬取的图片信息
    def download(adlist, root="/Users/sunkai/study_way/爬虫/picture/"):
        """Download every image in *adlist* into *root* as ``<index>.png``.

        Args:
            adlist: Sequence of entries whose first element is an image URL, as
                produced by ``getimgURL``. Empty entries/URLs are skipped.
            root: Target directory; created (including parents) if missing.
                Change the default to match your own machine.

        Files that already exist are left untouched. A download that fails
        (timeout, bad status, invalid URL) is reported and skipped so that one
        broken image does not abort the rest of the batch.
        """
        # Create the directory once up front; makedirs also handles missing parents.
        if not os.path.isdir(root):
            os.makedirs(root)

        for index, entry in enumerate(adlist):
            url = entry[0] if entry else ""
            path = os.path.join(root, str(index) + ".png")
            if not url or os.path.exists(path):
                continue
            try:
                r = requests.get(url, timeout=30)
                # Do not store an HTML error page under an image file name.
                r.raise_for_status()
            except requests.RequestException as err:
                print("skip %s: %s" % (url, err))
                continue
            with open(path, "wb") as f:
                f.write(r.content)
    47 
    def main():
        """Fetch the article page, extract its image URLs and download them."""
        url = 'https://mp.weixin.qq.com/s/Jy5bUXb4aOmzEoPe6WODJA'
        html = getHTMLText(url)
        # Named img_urls so the builtin ``list`` is not shadowed.
        img_urls = getimgURL(html)
        download(img_urls)


    # Run only when executed as a script, not when the module is imported.
    if __name__ == "__main__":
        main()

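    二、生成PDF文件

    需要安装的包
    1、pip install reportlab
    2、pip install pillow
    3、pip install "PyPDF2<3.0"(代码使用的 PdfFileWriter / PdfFileReader 在 PyPDF2 3.0 中已被移除)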

     1 #!/usr/bin/env python
     2 # -*- coding: utf-8 -*-
     3 # Author: KaiSun
     4 
     5 
     6 #  简单生成方式
     7 import os
     8 
     9 from reportlab.lib.pagesizes import A4, portrait, landscape
    10 from reportlab.pdfgen import canvas
    11 
    def convert_images_to_pdf(img_path, pdf_path):
        """Write the numbered images in *img_path* to *pdf_path*, one per page.

        Every image is stretched to fill a portrait A4 page. Images are ordered
        by the integer value of their file name without extension (``0.png``,
        ``1.png``, ...).

        Args:
            img_path: Directory containing the numbered image files.
            pdf_path: Path of the PDF file to create.
        """
        (w, h) = portrait(A4)
        c = canvas.Canvas(pdf_path, pagesize=portrait(A4))

        # Only numerically named files take part: stray entries such as
        # ".DS_Store" would otherwise make int() raise in the sort key.
        names = [n for n in os.listdir(img_path)
                 if os.path.splitext(n)[0].isdigit()]
        names.sort(key=lambda n: int(os.path.splitext(n)[0]))

        for name in names:
            c.drawImage(os.path.join(img_path, name), 0, 0, w, h)
            c.showPage()
        c.save()


    # Run only when executed as a script; adjust the paths to your machine.
    if __name__ == "__main__":
        convert_images_to_pdf('/Users/sunkai/study_way/爬虫/picture/',
                              '/Users/sunkai/study_way/爬虫/result.pdf')
    27 
    28 # 根据不同尺寸生成
    29 import os, shutil
    30 from PIL import Image
    31 from reportlab.lib.pagesizes import A4, portrait, landscape
    32 from reportlab.pdfgen import canvas
    33 from PyPDF2 import PdfFileWriter, PdfFileReader
    def convert_image_to_pdf(img_path, pdf_path):
        """Render a single image as a one-page A4 PDF.

        The page is landscape when the image is wider than it is tall and
        portrait otherwise; the image is stretched to fill the page.

        Args:
            img_path: Path of the image file to convert.
            pdf_path: Path of the one-page PDF to create.
        """
        # Open the image only to read its size, and close the handle right away.
        with Image.open(img_path) as img:
            (w0, h0) = img.size
        print(w0, h0)

        orientation = landscape if w0 > h0 else portrait
        (w, h) = orientation(A4)
        c = canvas.Canvas(pdf_path, pagesize=(w, h))
        c.drawImage(img_path, 0, 0, w, h)
        c.showPage()
        c.save()
    50 
    def convert_images_to_pdf(img_path, pdf_path):
        """Merge the numbered images in *img_path* into *pdf_path*.

        Each image is first converted to its own one-page PDF with
        ``convert_image_to_pdf`` (which picks portrait or landscape per image),
        and the pages are then concatenated in numeric file-name order.

        Args:
            img_path: Directory containing the numbered image files.
            pdf_path: Path of the merged PDF file to create.
        """
        import contextlib
        import tempfile

        # Only numerically named files take part: stray entries such as
        # ".DS_Store" would otherwise make int() raise in the sort key.
        names = [n for n in os.listdir(img_path)
                 if os.path.splitext(n)[0].isdigit()]
        names.sort(key=lambda n: int(os.path.splitext(n)[0]))

        # NOTE(review): PdfFileWriter/PdfFileReader are the names this file
        # imports from PyPDF2; confirm they exist in the installed version.
        output = PdfFileWriter()

        # ExitStack unwinds in reverse order: the per-page PDF handles are
        # closed first, then the private temporary directory is removed. A
        # fresh temp directory is used so an unrelated "./temp" is never deleted.
        with contextlib.ExitStack() as stack:
            tmp_path = stack.enter_context(tempfile.TemporaryDirectory())
            for page_no, name in enumerate(names, start=1):
                img = os.path.join(img_path, name)
                pdf = os.path.join(tmp_path, str(page_no) + ".pdf")
                convert_image_to_pdf(img, pdf)
                # The reader's stream must remain open until output.write().
                reader = PdfFileReader(stack.enter_context(open(pdf, "rb")))
                for page_index in range(reader.getNumPages()):
                    output.addPage(reader.getPage(page_index))
            with open(pdf_path, "wb") as output_stream:
                output.write(output_stream)


    # Run only when executed as a script; adjust the paths to your machine.
    if __name__ == "__main__":
        convert_images_to_pdf('/Users/sunkai/study_way/爬虫/picture/',
                              '/Users/sunkai/study_way/爬虫/result.pdf')
     
  • 相关阅读:
    【散列表】拉链法以及线性探查法
    【Spring】第一个hello world程序
    『Java基础』基本数据类型和包装类
    二进制、八进制、十进制、十六进制互转
    [MyBatis]缓存机制
    Ora-01830 日期格式图片在转换整个输入字符串之前结束
    leetcode1584. 连接所有点的最小费用(最小生成树算法的应用)
    VS安装SVN插件
    Winform打开不窗体设计器解决方法(Winform以普通C#类显示,打开不了设计器)
    Oracle修改System密码并解锁
  • 原文地址:https://www.cnblogs.com/sunkai1993/p/14371551.html
Copyright © 2011-2022 走看看