zoukankan      html  css  js  c++  java
  • url请求,request请求,解析库beauitfulsoup,解析库lxml

    url请求

    1 from urllib.request import urlopen
    2 url="****"
    3 respones = urlopen(url)
    4 content =  respones.read()
    5 content = content.decode('utf-8')
    6 print(content)

    request请求

     1 import requests
     2 url="***"
     3 headers = {'Accept': '*/*',
     4                'Accept-Language': 'en-US,en;q=0.8',
     5                'Cache-Control': 'max-age=0',
     6                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
     7                'Connection': 'keep-alive',
     8                'Referer': 'http://www.baidu.com/'
     9                }
    10 res = requests.get (url,headers=headers)#加headers头是为了伪装成浏览器浏览网页
    11 print(res.status_code)#打印状态码
    12 print(res.text)#打印文本
    13 print(res.content)#打印图片或者视频文本都可以

    解析库BeautifulSoup

     1 import requests
     2 from bs4 import BeautifulSoup
     3 url="http://news.qq.com/"
     4 headers = {'Accept': '*/*',
     5                'Accept-Language': 'en-US,en;q=0.8',
     6                'Cache-Control': 'max-age=0',
     7                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
     8                'Connection': 'keep-alive',
     9                'Referer': 'http://www.baidu.com/'
    10                }
    11 res = requests.get (url,headers=headers)#加headers头是为了伪装成浏览器浏览网页
    12 Soup = BeautifulSoup(res.text.encode("utf-8"),'lxml')
    13 en = Soup.find_all('en',attrs={'class':'f14 124'})
    14 for i in en:
    15     title = i.a.get_text()
    16     link = i.a['href']
    17     print({
    18         '标题':title,
    19         '链接':link
    20     })

    解析库lxml,xpath表达法

     1 import requests
     2 from lxml import etree
     3 url="http://news.qq.com/"
     4 headers = {'Accept': '*/*',
     5                'Accept-Language': 'en-US,en;q=0.8',
     6                'Cache-Control': 'max-age=0',
     7                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
     8                'Connection': 'keep-alive',
     9                'Referer': 'http://www.baidu.com/'
    10                }
    11 html = requests.get (url,headers=headers)#加headers头是为了伪装成浏览器浏览网页
    12 con  = etree.HTML(html.text)
    13 title = con.xpath('//en[@class="f14 124"]/a/text()')
    14 link = con.xpath('//en[@class="f14 124"]/a/@href')
    15 for i in zip(title,link):
    16     print({
    17         '标题':i[0],
    18         '链接':i[1]
    19     })

    selesct方法

     1 import requests
     2 from bs4 import BeautifulSoup
     3 url="http://news.qq.com/"
     4 headers = {'Accept': '*/*',
     5                'Accept-Language': 'en-US,en;q=0.8',
     6                'Cache-Control': 'max-age=0',
     7                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
     8                'Connection': 'keep-alive',
     9                'Referer': 'http://www.baidu.com/'
    10                }
    11 res = requests.get (url,headers=headers)#加headers头是为了伪装成浏览器浏览网页
    12 Soup = BeautifulSoup(res.text.encode("utf-8"),'lxml')
    13 en = Soup.select('en[class="f12 124"] a')
    14 for i in en:
    15     title = i.a.get_text()
    16     link = i.a['href']
    17     print({
    18         '标题':i[0],
    19         '链接':i[1]
    20     })
  • 相关阅读:
    函数对象与闭包
    20.03.19作业
    关键字参数,名称空间和作用域
    作业03.18
    函数第二天
    20.03.17作业
    文件
    20.01.16作业
    前端基础
    前端知识(二)
  • 原文地址:https://www.cnblogs.com/ZHANG576433951/p/11152616.html
Copyright © 2011-2022 走看看