# Douban Top 250 movie scraper using regular expressions.
import csv
import re

import requests

url = "https://movie.douban.com/top250"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url, headers=headers)
page_content = resp.text
resp.close()  # release the HTTP connection back to the pool

# Parse the data: named groups capture title, year, rating and vote count.
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?) .*?<span '
                 r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<num>.*?)人评价</span>', re.S)

# Match every movie and write one CSV row per hit.
# newline="" prevents csv from emitting blank lines on Windows; an explicit
# encoding makes the CJK titles round-trip regardless of the platform default.
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for it in obj.finditer(page_content):
        dic = it.groupdict()
        dic['year'] = dic['year'].strip()  # the year group captures surrounding whitespace
        csvwriter.writerow(dic.values())
print("over!")
# 电影天堂 (dytt8) scraper: grab the "latest movies" list on the home page,
# then visit each detail page to pull the movie title and download link.
import re

import requests

url = "https://www.dytt8.net/"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
# verify=False: the site serves a broken TLS certificate.
# NOTE(review): this disables certificate validation — acceptable for a demo only.
resp = requests.get(url, headers=headers, verify=False)
resp.encoding = 'gb2312'  # the site declares a GB charset, not UTF-8

# obj1 isolates the <ul> that follows the "latest recommendations" heading;
# obj2 pulls each relative link out of that <ul>;
# obj3 extracts title and download URL from a detail page.
obj1 = re.compile(r"最新影片推荐.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
obj3 = re.compile(r'◎片 名 (?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<download>.*?)">', re.S)

# Collect absolute URLs of the detail (child) pages.
sub_pages = []  # renamed from `list`, which shadowed the builtin
for it in obj1.finditer(resp.text):
    ul = it.group('ul')
    for itt in obj2.finditer(ul):
        # Relative href starts with "/"; strip it before joining to the base URL.
        sub_pages.append(url + itt.group('href').strip("/"))
resp.close()

# Visit the detail pages; the original demo stops after the first one.
for href in sub_pages:
    child = requests.get(href, headers=headers, verify=False)
    child.encoding = 'gb2312'  # detail pages use the same GB charset
    result3 = obj3.search(child.text)
    child.close()
    if result3 is None:
        # Layout mismatch on this page — skip instead of crashing on .group().
        continue
    print(result3.group("movie"))
    print(result3.group("download"))
    break  # demo: only the first matching detail page is processed
# bs4 scraper: daily vegetable price table from the Tongzhou district site.
import csv

import requests
from bs4 import BeautifulSoup

url = "http://www.bjtzh.gov.cn/bjtz/home/jrcj/index.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'  # declare the charset explicitly

# 1. Hand the page source to BeautifulSoup (stdlib html.parser backend).
page = BeautifulSoup(resp.text, "html.parser")
resp.close()
# 2. Locate the container div. attrs={"class": ...} is used instead of the
#    class_ keyword workaround (class is a reserved word in Python).
div = page.find("div", attrs={"class": "m-r-main m-textLists"})

# newline="" stops csv from emitting blank lines on Windows; an explicit
# encoding keeps the Chinese product names portable across platforms.
with open("菜价.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    trs = div.find_all("tr")[1:]  # [1:] skips the table header row
    for tr in trs:  # one table row per product
        tds = tr.find_all("td")
        name = tds[0].text
        category = tds[1].text
        high = tds[2].text
        avg = tds[3].text
        csvwriter.writerow([name, category, high, avg])
print("over!")
# Wallpaper downloader for pic.netbian.com (4K section).
import os
import time

import requests
from bs4 import BeautifulSoup

url = "https://pic.netbian.com/4kmeinv/"
url1 = "https://pic.netbian.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url, headers=headers)
resp.encoding = 'gbk'  # the site is GBK-encoded

# Create the output directory if needed — the original script crashed unless
# the user created img/ by hand first.
os.makedirs("img", exist_ok=True)

page = BeautifulSoup(resp.text, "html.parser")
# Every <a> inside div.slist links to one picture's detail page.
for a in page.find("div", class_="slist").find_all("a"):
    href = url1 + a.get('href')
    # Fetch the detail page to find the full-size image URL.
    resp2 = requests.get(href, headers=headers)
    resp2.encoding = 'gbk'  # detail pages share the list page's charset
    page2 = BeautifulSoup(resp2.text, "html.parser")
    resp2.close()
    div2 = page2.find("div", class_="photo-pic")
    img = div2.find("img")
    src = url1 + img.get("src")
    # Download the raw image bytes. The file name is the last path segment,
    # e.g. .../allimg/210831/102129-163037648996ad.jpg -> 102129-163037648996ad.jpg
    img_resp = requests.get(src)
    img_name = src.split("/")[-1]
    with open(os.path.join("img", img_name), mode="wb") as f:
        f.write(img_resp.content)  # with-block closes the file; no manual close needed
    img_resp.close()
    print(img_name + " is Download OK")
    time.sleep(0.5)  # be polite: throttle requests to the server
print("OVER")
# Thread pool + XPath extraction of price tables from maicainan.com.
import csv
import threading

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
f = open("1.csv", mode="w", encoding="UTF-8", newline="")
csvwriter = csv.writer(f)
# csv.writer is not thread-safe; serialize writerow calls across the workers.
_write_lock = threading.Lock()


def page1(url):
    """Fetch one detail page and append its price-table rows to 1.csv."""
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    resp.close()
    # Absolute XPath to the second table's tbody on the detail page.
    table = html.xpath("/html/body/div[4]/div[3]/div[3]/table[2]/tbody")[0]
    for tr in table.xpath("./tr"):  # one row of cells per <tr>
        txt = tr.xpath("./td/text()")
        # Strip non-breaking spaces. BUG FIX: the original replaced the
        # literal three characters "xa0" instead of the '\xa0' character.
        txt = [item.replace("\xa0", "") for item in txt]
        with _write_lock:
            csvwriter.writerow(txt)
    print(url + "提取完成")


if __name__ == '__main__':
    # 50 worker threads fan out over 88 page ids (11..98).
    with ThreadPoolExecutor(50) as t:
        for i in range(11, 99):
            t.submit(page1, f"http://www.maicainan.com/offer/show/classid/14/id/46{i}.html")
    # The pool's context manager waits for all tasks; close the file afterwards
    # so buffered rows are flushed (the original leaked the handle).
    f.close()
    print("全部提取完成")