#
# Save data in CSV format.
import csv

# newline='' hands line-ending control to the csv module, and the with-block
# closes the file even if one of the writes fails.
with open('test.csv', 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(('num', 'num+2', 'num*2'))  # header row
    # One data row per i in 0..9: the number, the number plus 2, the number doubled.
    writer.writerows((i, i + 2, i * 2) for i in range(10))
#
# MySQL access from Python (PyMySQL).
import pymysql

# Create the connection. The keyword was previously misspelled "chaset",
# which is not a pymysql.connect() option; "charset" is the intended one.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password=None, db='mysql', charset='utf8')
try:
    # The cursor is closed automatically when the with-block exits.
    with conn.cursor() as cur:
        cur.execute("USE scraping")  # select the database to query
        cur.execute("SELECT * FROM pages WHERE id=1")
        print(cur.fetchone())  # fetch a single row
finally:
    # Release the connection even if one of the statements above fails.
    conn.close()
#
# Specify the encoding explicitly when building bytes from text.
# bytes() has no "value" keyword and needs a real codec name together with a
# str source, so the previous call always raised TypeError. The result is
# bound to its own name so the builtin str stays usable by later code.
encoded_bytes = bytes('', encoding='utf-8')
from urllib.request import urlopen
from io import StringIO  # in-memory text buffer
import csv

# Download the CSV and decode it, dropping any non-ASCII bytes; the with-block
# closes the HTTP response as soon as the body has been read.
with urlopen("http://pythonscraping.com/files/MontyPythonAlbums.csv") as response:
    data = response.read().decode('ascii', 'ignore')

# Wrap the text in a file-like object so csv.reader can iterate over it.
dataFile = StringIO(data)
csvReader = csv.reader(dataFile)
for row in csvReader:
    if len(row) < 2:
        # Blank or short lines have no album/year pair to report.
        continue
    # The album name is wrapped in literal double quotes in the output.
    print('The album "{}" was released in {}'.format(row[0], row[1]))
# Requires the pdfminer3k package.
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from io import open
from urllib.request import urlopen


def readPDF(pdfFile):
    """Run pdfminer's text converter over a PDF and return the collected text.

    Args:
        pdfFile: File-like object that is handed to ``process_pdf``.

    Returns:
        Everything the converter wrote to its in-memory buffer, as one string.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter before reading the buffer, as the
            # original sequence did, and also when processing fails.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")
try:
    outputString = readPDF(pdfFile)
    print(outputString)
finally:
    # Close the HTTP response even if the PDF could not be processed.
    pdfFile.close()
from zipfile import ZipFile  # the .docx is opened as a zip archive below
from urllib.request import urlopen
from io import BytesIO
from bs4 import BeautifulSoup


def _is_title_run(text_elem):
    """Return True when the sibling before the text's parent has the "Title" style.

    Any lookup failure (no parent, no previous sibling, a sibling that is not
    a tag, no w:pStyle element, no w:val attribute) counts as "not a title".
    """
    try:
        style = text_elem.parent.previous_sibling.find("w:pStyle")
        return style["w:val"] == "Title"
    except (AttributeError, KeyError, TypeError):
        return False


# Download the document and keep it in memory so ZipFile can seek in it.
with urlopen("http://pythonscraping.com/pages/AWordDocument.docx") as response:
    wordFile = BytesIO(response.read())

# The archive is closed as soon as the XML member has been read.
with ZipFile(wordFile) as document:
    xml_content = document.read('word/document.xml')

wordObj = BeautifulSoup(xml_content.decode('utf-8'), "lxml-xml")
textStrings = wordObj.find_all("w:t")
for textElem in textStrings:
    closeTag = ""
    if _is_title_run(textElem):
        # Wrap title text in <h1>...</h1>; other text gets no tags.
        print("<h1>")
        closeTag = "</h1>"
    print(textElem.text)
    print(closeTag)