zoukankan      html  css  js  c++  java
  • python读取pdf内容

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    import pdfplumber
    import time
    from tqdm import tqdm
    import pandas as pd
    
    def get_balance_table(file):
        """Collect the consolidated balance sheet rows from a PDF report.

        Pages are scanned in order. Collection starts on the first page whose
        text contains both '合并资产负债表' and '编制单位', and scanning stops
        after the first page whose text contains both '负债和所有者权益总计'
        and '所有者权益合计'.

        Args:
            file: The PDF to read; passed unchanged to ``pdfplumber.open``.

        Returns:
            list: The rows returned by ``page.extract_table`` for every
            collected page, concatenated in page order. Empty when no
            start page is found or no table is detected.
        """
        # NOTE(review): these setting names follow the pdfplumber release the
        # script was written against -- confirm against the installed version.
        table_settings = {
            "vertical_strategy": "lines",
            "horizontal_strategy": "lines",
            "explicit_vertical_lines": [],
            "explicit_horizontal_lines": [],
            "snap_tolerance": 3,
            "join_tolerance": 3,
            "edge_min_length": 3,
            "min_words_vertical": 3,
            "min_words_horizontal": 1,
            "keep_blank_chars": False,
            "text_tolerance": 3,
            "text_x_tolerance": None,
            "text_y_tolerance": None,
            "intersection_tolerance": 1,
            "intersection_x_tolerance": None,
            "intersection_y_tolerance": None,
        }
        collecting = False
        rows = []
        with pdfplumber.open(file) as pdf:
            for page in pdf.pages:
                try:
                    # Treat a missing text layer (None) as empty text so the
                    # marker tests below cannot raise.
                    text = page.extract_text() or ""
                except Exception as e:
                    # Best effort: report the unreadable page and move on.
                    print(e)
                    continue
                if '合并资产负债表' in text and '编制单位' in text:
                    collecting = True
                if collecting:
                    try:
                        table = page.extract_table(table_settings)
                    except Exception as e:
                        print(e)
                        table = None
                    # extract_table gives None when no table is detected on
                    # the page; skip it rather than failing on extend().
                    if table:
                        rows.extend(table)
                # Checked even when table extraction failed, so scanning
                # never runs past the end of the balance sheet.
                if '负债和所有者权益总计' in text and '所有者权益合计' in text:
                    break
        return rows
        
    if __name__ == "__main__":
        # Script entry point: extract the balance sheet from "1.pdf", drop
        # rows whose first cell is null, save the rest to "1.xlsx", and
        # report the start time, end time and elapsed whole seconds.
        start_time = time.time()
        print("time start:%s"%(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(start_time))))
        data = get_balance_table("1.pdf")
        df = pd.DataFrame(data)
        if df.empty:
            # With no extracted rows the frame has no columns, so selecting
            # the first column with iloc would raise IndexError.
            print("no table rows extracted from 1.pdf; 1.xlsx not written")
        else:
            df = df[df.iloc[:, 0].notnull()]
            df.to_excel("1.xlsx")
        end_time = time.time()
        print("time end:%s"%(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(end_time))))
        print("take: %s S"%(int(end_time-start_time)))
        
  • 相关阅读:
    docker部署遇到的问题集合【持续更新】
    docker开发常用命令
    idea使用三步曲
    防缓存穿透设计
    亿级数据库分片分库架构设计亿【转】
    java-web项目换装servlet3.1.0后性能飙升到10000tps
    spring-kafka消费者配置
    分布式disconf+spring5使用遇到重复加载的问题
    jmeter性能压测
    springboot多profile环境maven配置
  • 原文地址:https://www.cnblogs.com/boye169/p/14136463.html
Copyright © 2011-2022 走看看