  • Notes on Web Scraping with Python (《Python网络数据采集》), Part 1

    Part One: Building Scrapers

    1.urllib

           1)urllib.request

           request.urlopen(url)

           request.urlretrieve downloads a file given its URL (a sketch follows this list)

           2)urllib.parse

           3)urllib.error
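
           A minimal sketch of both calls, reusing a sample URL that appears later in these notes:

           from urllib.request import urlopen, urlretrieve

           # urlopen returns a file-like response object
           html = urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')
           print(html.read()[:100])

           # urlretrieve downloads the resource at the URL straight into a local file
           urlretrieve('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt', 'chapter1.txt')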

          

                 

    2. Exceptions:

           try...except...else...

           Common exceptions: HTTPError, AttributeError
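
           A minimal sketch of the usual try...except...else pattern around a request and a parse (the URL is the book's sample site, used here only as a placeholder):

           from urllib.request import urlopen
           from urllib.error import HTTPError
           from bs4 import BeautifulSoup

           try:
               html = urlopen('http://www.pythonscraping.com/pages/page1.html')
           except HTTPError as e:
               print(e)                        # the server returned an error status such as 404
           else:
               bsObj = BeautifulSoup(html, 'lxml')
               try:
                   title = bsObj.h1.get_text()
               except AttributeError:
                   print('Tag was not found')  # bsObj.h1 is None, so .get_text() raises AttributeError
               else:
                   print(title)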

          

    3.BeautifulSoup

           from bs4 import BeautifulSoup

           bsObj=BeautifulSoup(html,'lxml')

           1)

           print(bsObj.text)

           print(bsObj.html)

           print(bsObj.p.a)

           ...

           2)

           findAll(tag, attributes, recursive, text, limit, keywords)     # returns a ResultSet

           find(tag, attributes, recursive, text, keywords)   # returns a Tag

           With these two functions you can easily filter an HTML page by tag attributes and find the group of tags, or the single tag, you need.

           Examples:

           .findAll({"h1","h2","h3","h4","h5","h6"})

           .findAll("span", {"class":{"green", "red"}})

           .findAll(id="text")   is the same as .findAll("", {"id":"text"})

           .findAll(src=True)   tags that have a src attribute

           3)

           get_text() strips every tag out of the HTML document you are working with and returns a str containing only the text
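
           A minimal sketch on an inline snippet (the markup is made up for illustration):

           from bs4 import BeautifulSoup

           html = '<div><p>Hello <a href="/x">world</a></p></div>'
           bsObj = BeautifulSoup(html, 'lxml')
           print(bsObj.get_text())     # 'Hello world' -- every tag stripped, text only
           print(bsObj.a.get_text())   # 'world' -- also works on any single Tag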

           4) Tree navigation (the items returned are Tag or NavigableString objects)

           .children (all direct children)

           .next_sibling (the next sibling)   .next_siblings (all following siblings)

           .previous_sibling (the previous sibling)   .previous_siblings (all preceding siblings)

           .parent (the direct parent)   .parents (all ancestors)
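
           A minimal sketch on an inline snippet (the markup is made up for illustration):

           from bs4 import BeautifulSoup

           html = '<ul><li>one</li><li>two</li><li>three</li></ul>'
           bsObj = BeautifulSoup(html, 'lxml')
           first = bsObj.ul.li
           for child in bsObj.ul.children:   # the three <li> tags
               print(child)
           print(first.next_sibling)         # <li>two</li>
           print(first.parent.name)          # 'ul'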

           5)

           .attrs      all attributes of the tag, as a dict

           .attrs['src']      the value of the src attribute
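
           A minimal sketch (the tag is made up for illustration):

           from bs4 import BeautifulSoup

           html = '<img src="logo.png" alt="logo" width="100">'
           bsObj = BeautifulSoup(html, 'lxml')
           img = bsObj.img
           print(img.attrs)          # {'src': 'logo.png', 'alt': 'logo', 'width': '100'}
           print(img.attrs['src'])   # 'logo.png'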

           6) Regular expressions
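
           findAll accepts a compiled regular expression as an attribute value; a minimal sketch (the markup is made up for illustration):

           import re
           from bs4 import BeautifulSoup

           html = '<img src="../img/gifts/img1.jpg"><img src="logo.png">'
           bsObj = BeautifulSoup(html, 'lxml')
           # keep only the product images under ../img/gifts/
           images = bsObj.findAll('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
           for image in images:
               print(image['src'])   # ../img/gifts/img1.jpg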

           7) Lambda expressions

           # find tags that have exactly two attributes:

           bsObj.findAll(lambda tag: len(tag.attrs) == 2)

          

    4.Scrapy

           //TODO

    5.JSON

           json.loads converts a JSON object into a dict,

           a JSON array into a list,

           and a JSON string into a Python str.

           Commonly used functions: loads, get (see example 5-1 below)
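
           A minimal sketch of those conversions (the literals are made up for illustration):

           import json

           print(type(json.loads('{"a": 1}')))     # <class 'dict'>
           print(type(json.loads('[1, 2, 3]')))    # <class 'list'>
           print(type(json.loads('"hello"')))      # <class 'str'>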

    6. Storing Data

           1) Downloading files

                  from urllib.request import urlretrieve

                  urlretrieve(resourceLocation,fileName)

           2)CSV(Comma-Separated Values)

                  import csv

                  csvFile=open("test.csv","w+",newline='',encoding='utf-8')

                  try:

                         writer=csv.writer(csvFile)

                         writer.writerow(('青山隐隐水迢迢 秋尽江南草未凋','24桥明月夜'))

                         for i in range(1,5):

                                writer.writerow((i,i+2,i*2))

                  finally:

                         csvFile.close()

           3)MySQL

                  import pymysql

                  # get a connection and a cursor

                  conn=pymysql.connect(host='localhost',user='root',passwd=None)

                  cur=conn.cursor()

                  # execute SQL statements

                  cur.execute('use ssm01')

                  cur.execute('select * from user')

                  print(cur.fetchone())  # fetch one row

                  # close resources

                  cur.close()

                  conn.close()

           4)Email

           //TODO

    7. Reading Documents

           1) Reading txt

           from urllib.request import urlopen    

           txt=urlopen('http://www.pythonscraping.com/pages/warandpeace/chapter1.txt')

           print(txt.read())

          

           2) Reading CSV

           # Read the file from the web directly into a string, then wrap it in a StringIO object so it can be treated like a file.

           from urllib.request import urlopen

           from io import StringIO

           import csv

           data = urlopen('http://pythonscraping.com/files/MontyPythonAlbums.csv').read().decode('utf-8')

           dataFile=StringIO(data)

           csvFile=csv.reader(dataFile)

           for row in csvFile:

                  print(row)

          

           3) Reading PDF

           #PDFMiner3K

           # Read any PDF into a string, then use StringIO to turn it into a file-like object

           from urllib.request import urlopen

           from pdfminer.pdfinterp import PDFResourceManager, process_pdf

           from pdfminer.converter import TextConverter

           from pdfminer.layout import LAParams

           from io import StringIO

           def readPDF(pdfFile):

                  rsrcmgr = PDFResourceManager()

                  retstr = StringIO()

                  laparams = LAParams()

                  device = TextConverter(rsrcmgr, retstr, laparams=laparams)

                  process_pdf(rsrcmgr, device, pdfFile)

                  device.close()

                  content = retstr.getvalue()

                  retstr.close()

                  return content

           pdfFile = urlopen("http://pythonscraping.com/pages/warandpeace/chapter1.pdf")

           outputString = readPDF(pdfFile)

           print(outputString)

           pdfFile.close()

    ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 

    3-1. Crawling the Web

           # Start from http://oreilly.com, then jump randomly from one external link to another.

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           import datetime

           import random

           pages = set()

           random.seed(datetime.datetime.now().timestamp())

           # Get a list of all internal links on a page

           def getInternalLinks(bsObj, includeUrl):

                  internalLinks = []

                  # Find all links that begin with "/" or contain the site URL

                  for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                         if link.attrs['href'] is not None:

                                if link.attrs['href'] not in internalLinks:

                                       internalLinks.append(link.attrs['href'])

                  return internalLinks

           # Get a list of all external links on a page

           def getExternalLinks(bsObj, excludeUrl):

                  externalLinks = []

                  # Find all links that start with "http" or "www" and do not contain the current URL

                  for link in bsObj.findAll("a",href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):

                         if link.attrs['href'] is not None:

                                if link.attrs['href'] not in externalLinks:

                                       externalLinks.append(link.attrs['href'])

                  return externalLinks

           def splitAddress(address):

                  addressParts = address.replace("http://", "").split("/")

                  return addressParts

           def getRandomExternalLink(startingPage):

                  html = urlopen(startingPage)

                  bsObj = BeautifulSoup(html,'lxml')

                  externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])

                  if len(externalLinks) == 0:

                          internalLinks = getInternalLinks(bsObj, splitAddress(startingPage)[0])

                          return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])

                  else:

                         return externalLinks[random.randint(0, len(externalLinks)-1)]

           def followExternalOnly(startingSite):

                  externalLink = getRandomExternalLink(startingSite)

                  print("随机外链是:"+externalLink)

                  followExternalOnly(externalLink)

           followExternalOnly("http://oreilly.com")

    5-1.JSON

           import json

           jsonString='''{
                  "arrayOfNums":[{"number":0},{"number":1},{"number":2}],
                  "arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]
                  }'''

           jsonObj=json.loads(jsonString)

           print(jsonObj.get("arrayOfFruits")[2].get("fruit"))

    6-1. Download every image from http://pythonscraping.com

           from urllib.request import urlretrieve

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           def pageSrc(url):

                  html=urlopen(url)

                  bsObj=BeautifulSoup(html,'lxml')

                  srcList=bsObj.findAll("img",src=True)

                  urlList=[]

                  for i in srcList:

                         urlList.append(i['src'])

                  return urlList

           def getInternalLinks(bsObj,includeUrl):

                         internalLinks = []

                         # Find all links that begin with "/" or contain the site URL

                         for link in bsObj.findAll("a", href=re.compile("^(/|.*"+includeUrl+")")):

                                if link.attrs['href'] is not None:

                                       if link.attrs['href'] not in internalLinks:

                                              internalLinks.append(link.attrs['href'])

                         return internalLinks

           def allimgs(url):

                  # Find all image src values on this page

                  srcset=set()

                  for i in pageSrc(url):

                         if i not in srcset:

                                print(i)

                                srcset.add(i)

                                name=i.split('/').pop()

                                urlretrieve(i,name)

                  # Find all internal links on this page

                  html=urlopen(url)

                  bsObj=BeautifulSoup(html,'lxml')

                  for i in getInternalLinks(bsObj,url):

                         newUrl=url+i

                         for j in pageSrc(newUrl):

                                if j not in srcset:

                                       srcset.add(j)

                                       print(j)

                                       name=j.split('/').pop()

                                       urlretrieve(j,name)

           url="http://pythonscraping.com"

           allimgs(url)

    6-2. Storing to CSV

           # Fetch an HTML table and write it to a CSV file

           import csv

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           html = urlopen("http://en.wikipedia.org/wiki/Comparison_of_text_editors")

           bsObj = BeautifulSoup(html,'lxml')

           # The main comparison table is the first table on the current page

           table = bsObj.findAll("table",{"class":"wikitable"})[0]

           rows = table.findAll("tr")

           csvFile = open("editors.csv", 'wt', newline='',encoding='utf-8')

           writer = csv.writer(csvFile)

           try:

                  for row in rows:

                         csvRow = []

                         for cell in row.findAll(['td', 'th']):

                                csvRow.append(cell.get_text()[:-1])

                         print(csvRow)

                         writer.writerow(csvRow)

           finally:

                  csvFile.close()

    6-3. Storing to MySQL

           # Store Wikipedia data

           from urllib.request import urlopen

           from bs4 import BeautifulSoup

           import re

           import datetime

           import random

           import pymysql

           conn = pymysql.connect(host='127.0.0.1',user='root', passwd=None, charset='utf8')

           cur = conn.cursor()

           cur.execute("USE ssm01")

           cur.execute("CREATE TABLE pages(title varchar(200),content varchar(3000))")

           random.seed(datetime.datetime.now().timestamp())

           # Store to the database

           def store(title, content):

                  cur.execute("INSERT INTO pages (title, content) VALUES ("%s","%s")", (title, content))

                  cur.connection.commit()

           # Scrape the data and store it in the database

           def getLinks(articleUrl):

                  html = urlopen("http://en.wikipedia.org"+articleUrl)

                  bsObj = BeautifulSoup(html,'lxml')

                  title = bsObj.find("h1").get_text()

                  content = bsObj.find("div", {"id":"mw-content-text"}).find("p").get_text()

                  store(title, content)

                  return bsObj.find("div", {"id":"bodyContent"}).findAll("a",href=re.compile("^(/wiki/)((?!:).)*$"))

           links = getLinks("/wiki/Kevin_Bacon")

           try:

                  while len(links) > 0:

                         newArticle = links[random.randint(0, len(links)-1)].attrs["href"]

                         print(newArticle)

                         links = getLinks(newArticle)

           finally:

                  cur.close()

                  conn.close()
