  • Notes on 《Python网络数据采集》 (Web Scraping with Python), Part 1

    Part I: Building Scrapers

    1.urllib

           1)urllib.request

           request.urlopen(url)

           request.urlretrieve downloads a file given its URL (see the sketch below)

           2)urllib.parse

           3)urllib.error
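           A minimal sketch tying these three modules together (the site URL and the local filename are placeholders):

           from urllib.request import urlopen, urlretrieve
           from urllib.parse import urljoin
           from urllib.error import HTTPError, URLError

           base = "http://www.pythonscraping.com"    # placeholder site used throughout the book
           try:
                  html = urlopen(base)                # returns an http.client.HTTPResponse
                  print(html.read()[:100])            # first bytes of the raw page
           except (HTTPError, URLError) as e:
                  print(e)

           # urllib.parse: resolve a relative link against the base URL
           print(urljoin(base, "/pages/page1.html"))

           # urllib.request.urlretrieve: download any URL to a local file
           urlretrieve(base, "homepage.html")         # the filename "homepage.html" is arbitrary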

          

                 

    2. Exceptions

           try...except...else...

           Common exceptions: HTTPError, AttributeError
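           A short sketch of the try...except...else pattern (the URL is one of the book's sample pages):

           from urllib.request import urlopen
           from urllib.error import HTTPError
           from bs4 import BeautifulSoup

           try:
                  html = urlopen("http://www.pythonscraping.com/pages/page1.html")
           except HTTPError as e:
                  print(e)                      # the server returned an error status (404, 500, ...)
           else:
                  bsObj = BeautifulSoup(html, 'lxml')
                  try:
                         title = bsObj.h1.get_text()
                  except AttributeError:
                         print("Tag not found")   # bsObj.h1 is None when <h1> is missing
                  else:
                         print(title)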

          

    3.BeautifulSoup

           from bs4 import BeautifulSoup

           bsObj=BeautifulSoup(html,'lxml')

           1)

           print(bsObj.text)

           print(bsObj.html)

           print(bsObj.p.a)

           ...

           2)

           findAll(tag, attributes, recursive, text, limit, keywords)     # returns a ResultSet

           find(tag, attributes, recursive, text, keywords)   # returns a Tag

           With these you can filter an HTML page by tag attributes and easily find the group of tags, or the single tag, you need.

           Examples:

           .findAll({"h1","h2","h3","h4","h5","h6"})

           .findAll("span", {"class":{"green", "red"}})

           .findAll(id="text")   same as .findAll("", {"id":"text"})

           .findAll(src=True)    every tag that has a src attribute
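           A runnable sketch of these filters against a small made-up HTML snippet:

           from bs4 import BeautifulSoup

           html = '''
           <h1>Title</h1>
           <span class="green">Anna</span> <span class="red">war</span>
           <p id="text">some text</p>
           <img src="a.jpg">
           '''
           bsObj = BeautifulSoup(html, 'lxml')

           print(bsObj.findAll({"h1", "h2", "h3"}))                   # any of several tags
           print(bsObj.findAll("span", {"class": {"green", "red"}}))  # spans whose class is green or red
           print(bsObj.findAll(id="text"))                            # same as findAll("", {"id": "text"})
           print(bsObj.findAll(src=True))                             # every tag with a src attribute
           print(bsObj.find("span", {"class": "green"}).get_text())   # a single Tag -> just its text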

           3)

           get_text() strips every tag from the HTML you are working with and returns a str containing only the text

           4) Navigation (returned objects are Tag or NavigableString)

           .children (all direct children)

           .next_sibling (the next sibling tag)      .next_siblings (all following siblings)

           .previous_sibling (the previous sibling tag)      .previous_siblings (all preceding siblings)

           .parent (the direct parent tag)      .parents (all ancestors)

           5)

           .attrs      all of the tag's attributes, as a dict

           .attrs['src']      the value of the src attribute
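           A sketch of 4) and 5) together, on a tiny made-up table:

           from bs4 import BeautifulSoup

           html = '''
           <table id="gifts">
           <tr><th>Item</th><th>Image</th></tr>
           <tr><td>Vegetable Basket</td><td><img src="../img/gifts/img1.jpg"></td></tr>
           <tr><td>Russian Nesting Dolls</td><td><img src="../img/gifts/img2.jpg"></td></tr>
           </table>
           '''
           bsObj = BeautifulSoup(html, 'lxml')

           for child in bsObj.find("table", {"id": "gifts"}).children:
                  print(child)                 # the <tr> rows (plus whitespace NavigableStrings)

           row = bsObj.find("tr")
           print(row.next_sibling)             # whatever directly follows the first row
           print(row.parent.name)              # 'table'

           img = bsObj.find("img")
           print(img.attrs)                    # all attributes as a dict
           print(img.attrs['src'])             # '../img/gifts/img1.jpg'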

           6) Regular expressions
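           Only a heading in the original notes; a small sketch of combining re with findAll (the markup and pattern are made up, loosely following the book's gift-image example):

           import re
           from bs4 import BeautifulSoup

           html = '<img src="../img/gifts/img1.jpg"><img src="logo.jpg">'
           bsObj = BeautifulSoup(html, 'lxml')

           # keep only the product images, not the logo
           for img in bsObj.findAll("img", {"src": re.compile(r"\.\./img/gifts/img.*\.jpg")}):
                  print(img["src"])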

           7) Lambda expressions

           # find tags that have exactly two attributes:

           bsObj.findAll(lambda tag: len(tag.attrs) == 2)

          

    4.Scrapy

           //TODO
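           Still a TODO in these notes; as a placeholder, a minimal spider sketch using Scrapy's standard Spider API (run it with "scrapy runspider article_spider.py"; the file name and URL are just examples):

           import scrapy

           class ArticleSpider(scrapy.Spider):
                  name = "article"
                  start_urls = ["https://en.wikipedia.org/wiki/Main_Page"]

                  def parse(self, response):
                         # response.css selects elements from the downloaded page
                         yield {"title": response.css("h1::text").get()}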

    5.JSON

           JSON objects convert to dicts,

           JSON arrays convert to lists,

           and JSON strings convert to Python strings.

           Common functions: loads, get
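           A quick sketch of those conversions (section 5-1 below has a fuller example):

           import json

           jsonString = '{"numbers": [1, 2, 3], "name": "pi", "nested": {"ok": true}}'
           obj = json.loads(jsonString)

           print(type(obj))              # <class 'dict'>  JSON object -> dict
           print(type(obj["numbers"]))   # <class 'list'>  JSON array  -> list
           print(type(obj["name"]))      # <class 'str'>   JSON string -> str
           print(obj.get("nested"))      # dicts support .get(), hence "common functions: loads, get"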

    6. Storing data

           1) Downloading

                  from urllib.request import urlretrieve

                  urlretrieve(resourceLocation,fileName)

           2)CSV(Comma-Separated Values)

                  import csv

                  csvFile=open("test.csv","w+")

                  try:

                         writer=csv.writer(csvFile)

                         writer.writerow(('青山隐隐水迢迢 秋尽江南草未凋','24桥明月夜'))

                         for i in range(1,5):

                                writer.writerow((i,i+2,i*2))

                  finally:

                         csvFile.close()

           3)MySQL

                  import pymysql

                  # get a connection and a cursor

                  conn=pymysql.connect(host='localhost',user='root',passwd=None)

                  cur=conn.cursor()

                  # execute SQL statements

                  cur.execute('use ssm01')

                  cur.execute('select * from user')

                  print(cur.fetchone())   # fetch one row

                  # close resources

                  cur.close()

                  conn.close()

           4)Email

           //TODO
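           Still a TODO in these notes; a minimal sketch with the standard-library smtplib, assuming an SMTP server is reachable on localhost (both addresses are placeholders):

           import smtplib
           from email.mime.text import MIMEText

           msg = MIMEText("The page has changed!")
           msg['Subject'] = "An email alert"
           msg['From'] = "alert@example.com"        # placeholder sender
           msg['To'] = "me@example.com"             # placeholder recipient

           s = smtplib.SMTP('localhost')            # assumes a mail server on localhost:25
           s.send_message(msg)
           s.quit()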

    7. Reading documents

           1) Reading a txt file

           from urllib.request import urlopen    

           txt=urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')

           print(txt.read())

          

           2) Reading a CSV file

           # read the file from the web into a string, then wrap it in a StringIO object so it behaves like a file

           from urllib.request import urlopen

           from io import StringIO

           import csv

           data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('utf-8')

           dataFile=StringIO(data)

           csvFile=csv.reader(dataFile)

           for row in csvFile:

                  print(row)

          

           3) Reading a PDF

           # PDFMiner3K

           # read any PDF into a string, then use StringIO to turn it into a file object

           from urllib.request import urlopen

           from pdfminer.pdfinterp import PDFResourceManager, process_pdf

           from pdfminer.converter import TextConverter

           from pdfminer.layout import LAParams

           from io import StringIO

           def readPDF(pdfFile):

                  rsrcmgr = PDFResourceManager()

                  retstr = StringIO()

                  laparams = LAParams()

                  device = TextConverter(rsrcmgr, retstr, laparams=laparams)

                  process_pdf(rsrcmgr, device, pdfFile)

                  device.close()

                  content = retstr.getvalue()

                  retstr.close()

                  return content

           pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

           outputString = readPDF(pdfFile)

           print(outputString)

           pdfFile.close()

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

    3-1. Crawling across the web

           # start at http://oreilly.com, then hop randomly from one external link to the next

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           import datetime

           import random

           pages = set()

           random.seed(datetime.datetime.now())

           # get a list of all internal links found on the page

           def getInternalLinks(bsObj, includeUrl):

                  internalLinks = []

                  # find all links that begin with "/" or contain the current URL

                  for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                         if link.attrs['href'] is not None:

                                if link.attrs['href'] not in internalLinks:

                                       internalLinks.append(link.attrs['href'])

                  return internalLinks

           # get a list of all external links found on the page

           def getExternalLinks(bsObj, excludeUrl):

                  externalLinks = []

                  # find all links that begin with "http" or "www" and do not contain the current URL

                  for link in bsObj.findAll("a",href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):

                         if link.attrs['href'] is not None:

                                if link.attrs['href'] not in externalLinks:

                                       externalLinks.append(link.attrs['href'])

                  return externalLinks

           def splitAddress(address):

                  addressParts = address.replace("http://", "").split("/")

                  return addressParts

           def getRandomExternalLink(startingPage):

                  html = urlopen(startingPage)

                  bsObj = BeautifulSoup(html,'lxml')

                  externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])

                  if len(externalLinks) == 0:

                         internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])

                         return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])

                  else:

                         return externalLinks[random.randint(0, len(externalLinks)-1)]

           def followExternalOnly(startingSite):

                  externalLink = getRandomExternalLink(startingSite)

                  print("Random external link is: " + externalLink)

                  followExternalOnly(externalLink)

           followExternalOnly("http://oreilly.com")

    5-1.JSON

           import json

           jsonString = '''{

                  "arrayOfNums":[{"number":0},{"number":1},{"number":2}],

                  "arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]

                  }'''

           jsonObj=json.loads(jsonString)

           print(jsonObj.get("arrayOfFruits")[2].get("fruit"))

    6-1. Download every image from http://pythonscraping.com

           from urllib.request import urlretrieve

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           def pageSrc(url):

                  html=urlopen(url)

                  bsObj=BeautifulSoup(html,'lxml')

                  srcList=bsObj.findAll("img",src=True)

                  urlList=[]

                  for i in srcList:

                         urlList.append(i['src'])

                  return urlList

           def getInternalLinks(bsObj,includeUrl):

                         internalLinks = []

                         # find all links that begin with "/" or contain the current URL

                         for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                                if link.attrs['href'] is not None:

                                       if link.attrs['href'] not in internalLinks:

                                              internalLinks.append(link.attrs['href'])

                         return internalLinks

           def allimgs(url):

                  # collect every img src on this page

                  srcset=set()

                  for i in pageSrc(url):

                         if i not in srcset:

                                print(i)

                                srcset.add(i)

                                name=i.split('/').pop()

                                urlretrieve(i,name)

                  # then follow every internal link on this page

                  html=urlopen(url)

                  bsObj=BeautifulSoup(html,'lxml')

                  for i in getInternalLinks(bsObj,url):

                         newUrl=url+i

                         for j in pageSrc(newUrl):

                                if j not in srcset:

                                       srcset.add(j)

                                       print(j)

                                       name=j.split('/').pop()

                                       urlretrieve(j,name)

           url="http://pythonscraping.com"

           allimgs(url)

    6-2. Storing to CSV

           # scrape an HTML table and write it to a CSV file

           import csv

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")

           bsObj = BeautifulSoup(html,'lxml')

           # the main comparison table is the first table on the page

           table = bsObj.findAll("table",{"class":"wikitable"})[0]

           rows = table.findAll("tr")

           csvFile = open("editors.csv", 'wt', newline='',encoding='utf-8')

           writer = csv.writer(csvFile)

           try:

                  for row in rows:

                         csvRow = []

                         for cell in row.findAll(['td', 'th']):

                                csvRow.append(cell.get_text()[:-1])

                         print(csvRow)

                         writer.writerow(csvRow)

           finally:

                  csvFile.close()

    6-3. Storing to MySQL

           # store Wikipedia article data

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           import datetime

           import random

           import pymysql

           conn = pymysql.connect(host='127.0.0.1',user='root', passwd=None, charset='utf8')

           cur = conn.cursor()

           cur.execute("USE ssm01")

           cur.execute("CREATE TABLE pages(title varchar(200),content varchar(3000))")

           random.seed(datetime.datetime.now())

           # store a record in the database

           def store(title, content):

                  cur.execute("INSERT INTO pages (title, content) VALUES (%s, %s)", (title, content))

                  cur.connection.commit()

           # scrape the page, store the data, and return new links to follow

           def getLinks(articleUrl):

                  html = urlopen("http://en.wikipedia.org"+articleUrl)

                  bsObj = BeautifulSoup(html,'lxml')

                  title = bsObj.find("h1").get_text()

                  content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()

                  store(title, content)

                  return bsObj.find("div", {"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))

           links = getLinks("/wiki/Kevin_Bacon")

           try:

                  while len(links) > 0:

                         newArticle = links[random.randint(0, len(links)-1)].attrs["href"]

                         print(newArticle)

                         links = getLinks(newArticle)

           finally:

                  cur.close()

                  conn.close()

  • Source: https://www.cnblogs.com/mznsndy/p/11697105.html