zoukankan html css js c++ java

python爬虫笔记（4-2）bs4模块实例

实例来源：https://www.bilibili.com/video/BV1i54y1h75W?p=1

# 爬取新发地菜价
from bs4 import BeautifulSoup
import requests
import csv

# Scrape the Xinfadi market price table and append its rows to a CSV file.

# Fetch the price page's HTML source.
url = "http://www.xinfadi.com.cn/marketanalysis/2/list/1.shtml"
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page.
resp.raise_for_status()
resp.encoding = "utf-8"

# Parse with bs4.
# 1. Feed the page text to BeautifulSoup; "html.parser" selects the
#    standard-library HTML parser.
page = BeautifulSoup(resp.text, "html.parser")

# 2. Look up data in the parsed document.
#    find("tag", attr=value)     -> first match only
#    find_all("tag", attr=value) -> every match
# `class` is a Python keyword, so it is written either as class_="hq_table"
# or, as here, passed through attrs={...}.
table = page.find("table", attrs={"class": "hq_table"})
if table is None:
    raise RuntimeError("price table (class 'hq_table') not found in " + url)

# All rows of the table except the first one (index 0), which is skipped.
trs = table.find_all("tr")[1:]

# mode="a+"  : append to the CSV file if it already exists.
# newline='' : stop the csv module from emitting extra blank lines.
# The `with` block guarantees the file is closed even if a row fails.
with open("菜价.csv", mode="a+", newline='', encoding="UTF-8") as f:
    csvwriter = csv.writer(f)
    for tr in trs:
        tds = tr.find_all("td")  # every cell of this row
        if len(tds) < 7:
            # Not a full data row (e.g. spacer/footer row); skip it
            # rather than crash with IndexError.
            continue
        # .text is the text wrapped by the tag. Column order:
        # name, low, avg, high, spec, unit, date.
        csvwriter.writerow([td.text for td in tds[:7]])

print("over!!")

# 抓取优美图库图片，存入文件夹
# 1、拿到主页面的源代码，提取到子页面的链接地址（href）
# 2、通过href拿到子页面的内容，找到下载地址 img-->src
# 3、下载图片
import csv
import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Download the images linked from the umei.net listing page into "imgg/".

# Fetch the listing page's HTML source.
url = "https://www.umei.net/katongdongman/dongmanbizhi/"
domain = "https://www.umei.net/"
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"

# The output folder must exist before open(..., "wb") can create files in it.
os.makedirs("imgg", exist_ok=True)

# Parse with bs4: listing page -> <a> tags -> href -> absolute child-page URL.
main_page = BeautifulSoup(resp.text, "html.parser")
type_list = main_page.find("div", class_="TypeList")
if type_list is None:
    raise RuntimeError("div with class 'TypeList' not found in " + url)
alist = type_list.find_all("a")  # every <a> inside the listing container

for a in alist:
    href = a.get("href")  # .get() reads an attribute value from the tag
    if not href:
        continue
    # urljoin resolves both "/path" and "path" against the domain without
    # producing a doubled slash, unlike plain string concatenation.
    child_url = urljoin(domain, href)

    # Fetch the child page's HTML source.
    child_resp = requests.get(child_url, timeout=10)
    if not child_resp.ok:
        print("skip (child page failed):", child_url)
        continue
    child_resp.encoding = "utf-8"

    # Child page -> <p align="center"> -> <img> -> src (the download link).
    child_page = BeautifulSoup(child_resp.text, "html.parser")
    p = child_page.find("p", align="center")
    img = p.find("img") if p is not None else None
    src = img.get("src") if img is not None else None
    if not src:
        # One page without the expected markup should not abort the run.
        print("skip (no image found):", child_url)
        continue

    # Download the image; resolve src against the child page in case it is
    # a relative URL (an absolute src is returned unchanged by urljoin).
    img_resp = requests.get(urljoin(child_url, src), timeout=10)
    if not img_resp.ok:
        # Avoid saving an HTTP error body as if it were an image.
        print("skip (image download failed):", src)
        continue

    # Use whatever follows the last "/" of the URL as the file name.
    img_name = src.split("/")[-1]
    with open(os.path.join("imgg", img_name), mode="wb") as f:
        f.write(img_resp.content)  # .content is the raw response bytes

    print("over!!", img_name)
    time.sleep(1)  # pause 1s after each image to go easy on the server

print("all over!")

查看全文

相关阅读:
向量杂谈
 对widget使用WM_SetCallback
群延迟与广义线性相位
 采样的频域表示
 Oracle 表的连接方式(2)-----HASH JOIN的基本机制1
Oracle 表的连接方式(1)-----Nested loop join和 Sort merge join
Oracle 表的访问方式(2)-----索引扫描
 Oracle 表的访问方式(1) ---全表扫描、通过ROWID访问表
 11g RAC R2 之Linux DNS 配置
 11g RAC r2 的启停命令概述1

原文地址：https://www.cnblogs.com/testerhappy/p/15141024.html