  • Notes on 《Python网络数据采集》 (Web Scraping with Python), Part 1

    Part I: Building Scrapers

    1.urllib

           1)urllib.request

           request.urlopen(url)

           request.urlretrieve downloads a file given its URL (see the sketch below)

           2)urllib.parse

           3)urllib.error
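           A minimal sketch tying these three modules together (the site URL and the local filename are placeholders):

           from urllib.request import urlopen, urlretrieve
           from urllib.parse import urljoin
           from urllib.error import HTTPError, URLError

           base = "http://www.pythonscraping.com"    # placeholder site used throughout the book
           try:
                  html = urlopen(base)                # returns an http.client.HTTPResponse
                  print(html.read()[:100])            # first bytes of the raw page
           except (HTTPError, URLError) as e:
                  print(e)

           # urllib.parse: resolve a relative link against the base URL
           print(urljoin(base, "/pages/page1.html"))

           # urllib.request.urlretrieve: download any URL to a local file
           urlretrieve(base, "homepage.html")         # the filename "homepage.html" is arbitrary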

          

                 

    2. Exceptions

           try...except...else...

           Common exceptions: HTTPError, AttributeError
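           A short sketch of the try...except...else pattern (the URL is one of the book's sample pages):

           from urllib.request import urlopen
           from urllib.error import HTTPError
           from bs4 import BeautifulSoup

           try:
                  html = urlopen("http://www.pythonscraping.com/pages/page1.html")
           except HTTPError as e:
                  print(e)                      # the server returned an error status (404, 500, ...)
           else:
                  bsObj = BeautifulSoup(html, 'lxml')
                  try:
                         title = bsObj.h1.get_text()
                  except AttributeError:
                         print("Tag not found")   # bsObj.h1 is None when <h1> is missing
                  else:
                         print(title)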

          

    3.BeautifulSoup

           from bs4 import BeautifulSoup

           bsObj=BeautifulSoup(html,'lxml')

           1)

           print(bsObj.text)

           print(bsObj.html)

           print(bsObj.p.a)

           ...

           2)

           findAll(tag, attributes, recursive, text, limit, keywords)     # returns a ResultSet

           find(tag, attributes, recursive, text, keywords)   # returns a Tag

           With these you can filter an HTML page by tag attributes and easily find the group of tags, or the single tag, you need.

           Examples:

           .findAll({"h1","h2","h3","h4","h5","h6"})

           .findAll("span", {"class":{"green", "red"}})

           .findAll(id="text")   same as .findAll("", {"id":"text"})

           .findAll(src=True)    every tag that has a src attribute
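           A runnable sketch of these filters against a small made-up HTML snippet:

           from bs4 import BeautifulSoup

           html = '''
           <h1>Title</h1>
           <span class="green">Anna</span> <span class="red">war</span>
           <p id="text">some text</p>
           <img src="a.jpg">
           '''
           bsObj = BeautifulSoup(html, 'lxml')

           print(bsObj.findAll({"h1", "h2", "h3"}))                   # any of several tags
           print(bsObj.findAll("span", {"class": {"green", "red"}}))  # spans whose class is green or red
           print(bsObj.findAll(id="text"))                            # same as findAll("", {"id": "text"})
           print(bsObj.findAll(src=True))                             # every tag with a src attribute
           print(bsObj.find("span", {"class": "green"}).get_text())   # a single Tag -> just its text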

           3)

           get_text() strips every tag from the HTML you are working with and returns a str containing only the text

           4) Navigation (returned objects are Tag or NavigableString)

           .children (all direct children)

           .next_sibling (the next sibling tag)      .next_siblings (all following siblings)

           .previous_sibling (the previous sibling tag)      .previous_siblings (all preceding siblings)

           .parent (the direct parent tag)      .parents (all ancestors)

           5)

           .attrs      all of the tag's attributes, as a dict

           .attrs['src']      the value of the src attribute
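           A sketch of 4) and 5) together, on a tiny made-up table:

           from bs4 import BeautifulSoup

           html = '''
           <table id="gifts">
           <tr><th>Item</th><th>Image</th></tr>
           <tr><td>Vegetable Basket</td><td><img src="../img/gifts/img1.jpg"></td></tr>
           <tr><td>Russian Nesting Dolls</td><td><img src="../img/gifts/img2.jpg"></td></tr>
           </table>
           '''
           bsObj = BeautifulSoup(html, 'lxml')

           for child in bsObj.find("table", {"id": "gifts"}).children:
                  print(child)                 # the <tr> rows (plus whitespace NavigableStrings)

           row = bsObj.find("tr")
           print(row.next_sibling)             # whatever directly follows the first row
           print(row.parent.name)              # 'table'

           img = bsObj.find("img")
           print(img.attrs)                    # all attributes as a dict
           print(img.attrs['src'])             # '../img/gifts/img1.jpg'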

           6) Regular expressions
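           Only a heading in the original notes; a small sketch of combining re with findAll (the markup and pattern are made up, loosely following the book's gift-image example):

           import re
           from bs4 import BeautifulSoup

           html = '<img src="../img/gifts/img1.jpg"><img src="logo.jpg">'
           bsObj = BeautifulSoup(html, 'lxml')

           # keep only the product images, not the logo
           for img in bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")}):
                  print(img["src"])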

           7) Lambda expressions

           # find tags that have exactly two attributes:

           bsObj.findAll(lambda tag: len(tag.attrs) == 2)

          

    4.Scrapy

           //TODO
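           Still a TODO in these notes; as a placeholder, a minimal spider sketch using Scrapy's standard Spider API (run it with "scrapy runspider article_spider.py"; the file name and URL are just examples):

           import scrapy

           class ArticleSpider(scrapy.Spider):
                  name = "article"
                  start_urls = ["https://en.wikipedia.org/wiki/Main_Page"]

                  def parse(self, response):
                         # response.css selects elements from the downloaded page
                         yield {"title": response.css("h1::text").get()}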

    5.JSON

           JSON objects convert to dicts,

           JSON arrays convert to lists,

           and JSON strings convert to Python strings.

           Common functions: loads, get
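           A quick sketch of those conversions (section 5-1 below has a fuller example):

           import json

           jsonString = '{"numbers": [1, 2, 3], "name": "pi", "nested": {"ok": true}}'
           obj = json.loads(jsonString)

           print(type(obj))              # <class 'dict'>  JSON object -> dict
           print(type(obj["numbers"]))   # <class 'list'>  JSON array  -> list
           print(type(obj["name"]))      # <class 'str'>   JSON string -> str
           print(obj.get("nested"))      # dicts support .get(), hence "common functions: loads, get"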

    6. Storing data

           1) Downloading

                  from urllib.request import urlretrieve

                  urlretrieve(resourceLocation,fileName)

           2)CSV(Comma-Separated Values)

                  import csv

                  csvFile=open("test.csv","w+")

                  try:

                         writer=csv.writer(csvFile)

                         writer.writerow(('青山隐隐水迢迢 秋尽江南草未凋','24桥明月夜'))

                         for i in range(1,5):

                                writer.writerow((i,i+2,i*2))

                  finally:

                         csvFile.close()

           3)MySQL

                  import pymysql

                  # get a connection and a cursor

                  conn=pymysql.connect(host='localhost',user='root',passwd=None)

                  cur=conn.cursor()

                  # execute SQL statements

                  cur.execute('use ssm01')

                  cur.execute('select * from user')

                  print(cur.fetchone())   # fetch one row

                  # close resources

                  cur.close()

                  conn.close()

           4)Email

           //TODO
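           Still a TODO in these notes; a minimal sketch with the standard-library smtplib, assuming an SMTP server is reachable on localhost (both addresses are placeholders):

           import smtplib
           from email.mime.text import MIMEText

           msg = MIMEText("The page has changed!")
           msg['Subject'] = "An email alert"
           msg['From'] = "alert@example.com"        # placeholder sender
           msg['To'] = "me@example.com"             # placeholder recipient

           s = smtplib.SMTP('localhost')            # assumes a mail server on localhost:25
           s.send_message(msg)
           s.quit()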

    7. Reading documents

           1) Reading a txt file

           from urllib.request import urlopen    

           txt=urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')

           print(txt.read())

          

           2) Reading a CSV file

           # read the file from the web into a string, then wrap it in a StringIO object so it behaves like a file

           from urllib.request import urlopen

           from io import StringIO

           import csv

           data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('utf-8')

           dataFile=StringIO(data)

           csvFile=csv.reader(dataFile)

           for row in csvFile:

                  print(row)

          

           3) Reading a PDF

           # PDFMiner3K

           # read any PDF into a string, then use StringIO to turn it into a file object

           from urllib.request import urlopen

           from pdfminer.pdfinterp import PDFResourceManager, process_pdf

           from pdfminer.converter import TextConverter

           from pdfminer.layout import LAParams

           from io import StringIO

           def readPDF(pdfFile):

                  rsrcmgr = PDFResourceManager()

                  retstr = StringIO()

                  laparams = LAParams()

                  device = TextConverter(rsrcmgr, retstr, laparams=laparams)

                  process_pdf(rsrcmgr, device, pdfFile)

                  device.close()

                  content = retstr.getvalue()

                  retstr.close()

                  return content

           pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

           outputString = readPDF(pdfFile)

           print(outputString)

           pdfFile.close()

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

    3-1. Crawling across the web

           # start at http://oreilly.com, then hop randomly from one external link to the next

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           import datetime

           import random

           pages = set()

           random.seed(datetime.datetime.now())

           # get a list of all internal links found on the page

           def getInternalLinks(bsObj, includeUrl):

                  internalLinks = []

                  # find all links that begin with "/" or contain the current URL

                  for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                         if link.attrs['href'] is not None:

                                if link.attrs['href'] not in internalLinks:

                                       internalLinks.append(link.attrs['href'])

                  return internalLinks

           # get a list of all external links found on the page

           def getExternalLinks(bsObj, excludeUrl):

                  externalLinks = []

                  # find all links that begin with "http" or "www" and do not contain the current URL

                  for link in bsObj.findAll("a",href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):

                         if link.attrs['href'] is not None:

                                if link.attrs['href'] not in externalLinks:

                                       externalLinks.append(link.attrs['href'])

                  return externalLinks

           def splitAddress(address):

                  addressParts = address.replace("http://", "").split("/")

                  return addressParts

           def getRandomExternalLink(startingPage):

                  html = urlopen(startingPage)

                  bsObj = BeautifulSoup(html,'lxml')

                  externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])

                  if len(externalLinks) == 0:

                         internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])

                         return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])

                  else:

                         return externalLinks[random.randint(0, len(externalLinks)-1)]

           def followExternalOnly(startingSite):

                  externalLink = getRandomExternalLink(startingSite)

                  print("Random external link is: " + externalLink)

                  followExternalOnly(externalLink)

           followExternalOnly("http://oreilly.com")

    5-1.JSON

           import json

           jsonString = '''{

                  "arrayOfNums":[{"number":0},{"number":1},{"number":2}],

                  "arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]

                  }'''

           jsonObj=json.loads(jsonString)

           print(jsonObj.get("arrayOfFruits")[2].get("fruit"))

    6-1. Download every image from http://pythonscraping.com

           from urllib.request import urlretrieve

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           def pageSrc(url):

                  html=urlopen(url)

                  bsObj=BeautifulSoup(html,'lxml')

                  srcList=bsObj.findAll("img",src=True)

                  urlList=[]

                  for i in srcList:

                         urlList.append(i['src'])

                  return urlList

           def getInternalLinks(bsObj,includeUrl):

                         internalLinks = []

                         # find all links that begin with "/" or contain the current URL

                         for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                                if link.attrs['href'] is not None:

                                       if link.attrs['href'] not in internalLinks:

                                              internalLinks.append(link.attrs['href'])

                         return internalLinks

           def allimgs(url):

                  # collect every img src on this page

                  srcset=set()

                  for i in pageSrc(url):

                         if i not in srcset:

                                print(i)

                                srcset.add(i)

                                name=i.split('/').pop()

                                urlretrieve(i,name)

                  # then follow every internal link on this page

                  html=urlopen(url)

                  bsObj=BeautifulSoup(html,'lxml')

                  for i in getInternalLinks(bsObj,url):

                         newUrl=url+i

                         for j in pageSrc(newUrl):

                                if j not in srcset:

                                       srcset.add(j)

                                       print(j)

                                       name=j.split('/').pop()

                                       urlretrieve(j,name)

           url="http://pythonscraping.com"

           allimgs(url)

    6-2. Storing to CSV

           # scrape an HTML table and write it to a CSV file

           import csv

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")

           bsObj = BeautifulSoup(html,'lxml')

           # the main comparison table is the first table on the page

           table = bsObj.findAll("table",{"class":"wikitable"})[0]

           rows = table.findAll("tr")

           csvFile = open("editors.csv", 'wt', newline='',encoding='utf-8')

           writer = csv.writer(csvFile)

           try:

                  for row in rows:

                         csvRow = []

                         for cell in row.findAll(['td', 'th']):

                                csvRow.append(cell.get_text()[:-1])

                         print(csvRow)

                         writer.writerow(csvRow)

           finally:

                  csvFile.close()

    6-3. Storing to MySQL

           # store Wikipedia article data

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           import datetime

           import random

           import pymysql

           conn = pymysql.connect(host='127.0.0.1',user='root', passwd=None, charset='utf8')

           cur = conn.cursor()

           cur.execute("USE ssm01")

           cur.execute("CREATE TABLE pages(title varchar(200),content varchar(3000))")

           random.seed(datetime.datetime.now())

           # store a record in the database

           def store(title, content):

                  cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))

                  cur.connection.commit()

           # scrape the page, store the data, and return new links to follow

           def getLinks(articleUrl):

                  html = urlopen("http://en.wikipedia.org"+articleUrl)

                  bsObj = BeautifulSoup(html,'lxml')

                  title = bsObj.find("h1").get_text()

                  content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()

                  store(title, content)

                  return bsObj.find("div", {"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))

           links = getLinks("/wiki/Kevin_Bacon")

           try:

                  while len(links) > 0:

                         newArticle = links[random.randint(0, len(links)-1)].attrs["href"]

                         print(newArticle)

                         links = getLinks(newArticle)

           finally:

                  cur.close()

                  conn.close()

  • Source: https://www.cnblogs.com/mznsndy/p/11697105.html