zoukankan      html  css  js  c++  java
  • 数据存储 csv

    #

    # Save data in CSV format.
    import csv

    # newline='' stops the csv module's own line terminators from being
    # translated again by the text layer; 'w+' opens the file for text
    # read/write and truncates it.
    with open('test.csv', 'w+', newline='') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerow(('num', 'num+2', 'num*2'))  # header row
        for i in range(10):
            writer.writerow((i, i + 2, i * 2))

    #

    # Working with MySQL from Python through PyMySQL.
    import pymysql  # third-party MySQL driver

    # Open the connection; `charset` selects the connection character set.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           password=None, db='mysql', charset='utf8')
    try:
        cur = conn.cursor()  # create a cursor
        try:
            cur.execute("USE scraping")  # switch to the scraping database
            cur.execute("SELECT * FROM pages WHERE id=1")  # run the query
            print(cur.fetchone())  # fetch a single row
        finally:
            cur.close()  # release the cursor even if a query fails
    finally:
        conn.close()  # close the connection even if a query fails

    #

    # The builtin `str` must not be rebound here: it is called below to format
    # each row. The encoding is given explicitly when the download is decoded.
    from urllib.request import urlopen
    from io import StringIO  # in-memory text buffer
    import csv

    # Read the whole CSV over HTTP and close the response when done.
    with urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv") as response:
        data = response.read().decode('ascii', 'ignore')
    # csv.reader needs a file-like object, so wrap the text in StringIO.
    dataFile = StringIO(data)
    csvReader = csv.reader(dataFile)

    for row in csvReader:
        # Single-quoted literals let the album name be wrapped in double quotes.
        print('The album "' + row[0] + '" was released in ' + str(row[1]))
    #pdfminer3k
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from io import StringIO
    from io import open
    from urllib.request import urlopen
    
    def readPDF(pdfFile):
        """Extract the text of a PDF and return it as a single string.

        Args:
            pdfFile: The PDF source handed to pdfminer's ``process_pdf``
                (the script in this file passes a ``urlopen`` response).

        Returns:
            The text that the ``TextConverter`` wrote into its buffer.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        retstr = StringIO()  # buffer the converter writes extracted text into
        try:
            device = TextConverter(rsrcmgr, retstr, laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, pdfFile)
            finally:
                # Close the converter even when processing raises.
                device.close()
            # Read the text before the buffer is closed below.
            return retstr.getvalue()
        finally:
            retstr.close()
    
    # Download the sample PDF and print its extracted text; the `with` block
    # closes the HTTP response even if extraction fails.
    with urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf") as pdfFile:
        outputString = readPDF(pdfFile)
        print(outputString)
    from zipfile import ZipFile  #docx
    from urllib.request import urlopen
    from io import BytesIO
    from bs4 import BeautifulSoup
    
    # Download the .docx into memory; the script opens it as a ZIP archive and
    # reads the document body from word/document.xml.
    with urlopen("http://pythonscraping.com/pages/AWordDocument.docx") as response:
        wordFile = BytesIO(response.read())
    with ZipFile(wordFile) as document:
        xml_content = document.read('word/document.xml')

    wordObj = BeautifulSoup(xml_content.decode('utf-8'), "lxml-xml")
    textStrings = wordObj.find_all("w:t")  # every text run in the document
    for textElem in textStrings:
        closeTag = ""
        try:
            # Look for a paragraph style on the element preceding the run's parent.
            style = textElem.parent.previous_sibling.find("w:pStyle")
            # .get() avoids a KeyError when the style has no w:val attribute.
            if style is not None and style.get("w:val") == "Title":
                print("<h1>")
                closeTag = "</h1>"
        except AttributeError:
            # No usable previous sibling: print the text without heading tags.
            pass
        print(textElem.text)
        print(closeTag)
  • 相关阅读:
    kali长时间未使用导致数字签名过期无法更新源解决办法
    4.爬虫去重策略
    3.编码问题
    kalinux 五笔安装
    ★★★kalinux 常用命令
    安装vm tools时出现如下问题 The path "/usr/bin/gcc" is not valid path to the
    kalinux实现自适用全屏、与物理主机共享文件方法
    wifi pj WiFiPhisher 安装使用
    条款20:在传递对象的时候尽量用reference-to-const来代替pass-by-value
    条款19:定义class就相当于定义一个个的内置类型
  • 原文地址:https://www.cnblogs.com/zhangchen-sx/p/11160882.html
Copyright © 2011-2022 走看看