1、从酷狗网站爬取 新歌首发的新歌名字、播放时长、链接等
from bs4 import BeautifulSoup as BS
import requests
import re
import json


class StockCrawler():
    """Scrape the new-song list (name, playing time, link) from a web page."""

    def __init__(self):
        pass

    def get_stockinfo(self, url):
        """Download *url* and extract the songs listed in its new-song area.

        Args:
            url: Address of the page to download.

        Returns:
            A list of ``(songname, songtime, link)`` tuples, one per usable
            ``<li>`` inside the first ``<div id="SongtabContent">``. The list
            is empty when that container is not present in the page.

        Raises:
            requests.exceptions.RequestException: If the download fails or
                does not complete within the timeout.
        """
        # A timeout keeps the crawler from hanging forever on a dead server.
        res = requests.get(url, timeout=10)
        res.encoding = 'utf-8'
        soup = BS(res.text, 'html.parser')
        stock_info = []

        # Locate the outermost tag of the new-song area.
        div = soup.find_all('div', id='SongtabContent')
        if not div:
            # Page layout changed or the request returned something else.
            return stock_info

        # Walk every song entry and pull out name, playing time and link.
        for i in div[0].find_all('li'):
            print(i)
            anchor = i.find('a')
            if anchor is None or not anchor.has_attr('href'):
                # Entry without a link: nothing useful to record.
                continue
            name_tag = anchor.find('span', class_='songName')
            time_tag = anchor.find('span', class_='songTime')
            if name_tag is None or time_tag is None:
                continue

            # The href is prefixed with the scheme, as in the original code.
            link = 'http:' + anchor['href']
            print('link:', link)
            songname = name_tag.text
            songtime = time_tag.text
            print('songname:', songname)
            print('songtime:', songtime)
            stock_info.append((songname, songtime, link))
        return stock_info

    def write_file(self, data, file_name):
        """Write *data* to *file_name* as comma-separated lines.

        Args:
            data: Iterable of ``(songname, songtime, link)`` string tuples.
            file_name: Path of the UTF-8 text file to (over)write.
        """
        with open(file_name, 'w', encoding='utf-8') as fp:
            # One record per line; a bare space separator would run all
            # records together on a single line.
            fp.writelines(','.join((i[0], i[1], i[2])) + '\n' for i in data)


if __name__ == '__main__':
    url = 'http://www.kugou.com/'
    stockcrawler = StockCrawler()
    data = stockcrawler.get_stockinfo(url)
    # Raw string: in a normal literal "\t" would be read as a tab character.
    stockcrawler.write_file(data, r'f:\test\pppp.txt')
2、sohu网站首页 爬取包含"体育"2个字的链接
import requests
from bs4 import BeautifulSoup

# Download the Sohu home page and parse it.
r = requests.get("http://www.sohu.com", timeout=10)
r.encoding = "utf-8"
html = r.text
soup = BeautifulSoup(html, "html.parser")  # html is the page's HTML text

# Collect the hrefs of all anchors, dropping the ones that cannot be fetched.
links = []
for i in soup.find_all("a"):
    href = i.get("href")
    if href is None:
        # Anchor without an href attribute: nothing to follow.
        continue
    # Strip once up front so every check below sees the same text.
    href = href.strip()
    print(href)
    if href.startswith("//"):
        # Protocol-relative URL: give it an explicit scheme.
        print(1)
        href = "http:" + href
    if href.find("javascript") >= 0:
        print(2)
        continue
    if href.find("mailto") >= 0:
        print(3)
        continue
    if len(href) <= 1:
        # Empty or single-character href (for example "/" or "#").
        print(4)
        continue
    links.append(href)

for link in links:
    print(link)

# Fetch every collected link and save the pages whose text contains "体育".
x = 1
for link in links:
    try:
        r = requests.get(link, timeout=10)
    except requests.exceptions.RequestException as e:
        # Relative or malformed URLs and network failures are reported and
        # skipped so that one bad link does not abort the whole run.
        print(e)
        continue
    r.encoding = "utf-8"
    if "体育" in r.text:
        # Backslashes are doubled: "e:\pic\" would escape the closing quote.
        with open("e:\\pic\\" + str(x) + ".txt", "w", encoding="utf-8") as fp:
            fp.write(r.text)
        x += 1
3、使用代理服务器 发送请求
import requests

# Address of the proxy server, as host:port.
proxy = '168.0.86.146:8080'
# If the proxy requires authentication, put the user name and password in
# front of the address, for example:
# proxy='username:password@124.243.226.18:8888'

# Proxy URL to use for each scheme of the target URL.
# NOTE(review): the 'https' entry reaches the proxy itself over TLS; confirm
# the proxy supports that, otherwise an 'http://' proxy URL is needed here.
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}

try:
    # The timeout stops the call from blocking forever on a dead proxy.
    response = requests.get('http://httpbin.org/get', proxies=proxies, timeout=10)
    print(response.text)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
    # Timeout is caught as well because a read timeout is not a
    # ConnectionError and would otherwise end the script with a traceback.
    print("Error", e.args)
4、Scrapy 爬虫框架
#Scrapy 爬虫框架
'''
创建工程的命令:scrapy startproject testman
1)items.py:声明用来存储要爬取的数据的变量,用法类似于字典。
2)pipelines.py(保存爬取后的数据):把抓取网页、分析后存入变量中的数据保存到某个地方(json文件、txt文件、excel、数据库)。
3)settings.py:设定pipeline的优先级(执行顺序),内容是自动生成的,取消掉注释就可以了。
   ITEM_PIPELINES = {'gr.pipelines.GrPipeline': 300,}
   gr:抓取器(工程)的名字
4)在spider的目录下,写分析逻辑(从网页中取想要的数据,保存到items.py声明的变量中)。

框架使用的步骤:
1 新建scrapy工程,在任意目录下,cmd中执行:scrapy startproject groad
2 生成工程的内容(爬虫文件),在scrapy工程的根目录下,cmd中执行:scrapy genspider newsong www.kugou.com
3 编写相关代码
4 在scrapy工程的根目录下,cmd中执行抓取:scrapy crawl newsong
'''
5、Scrapy框架的例子,酷狗爬取新歌首发的歌曲名称、播放时长、歌曲链接等
# ---- items.py ----
import scrapy


class GroadItem(scrapy.Item):
    """One song scraped from the new-song list."""

    # Fields of the item.
    songname = scrapy.Field()   # song title
    songtime = scrapy.Field()   # playing time
    href_song = scrapy.Field()  # link that plays the song


if __name__ == '__main__':
    g = GroadItem()
    # Indexing a field that was never assigned raises KeyError on an Item,
    # so the demo reads it with .get(), which yields None instead.
    print(g.get('songname'))


# ---- pipelines.py ----
import json


class GroadPipeline(object):
    """Item pipeline that appends every item to a text file as JSON."""

    def __init__(self):
        # Despite its name, this attribute holds the open file object.
        # Raw string: in a normal literal "\t" and "\n" would be read as a
        # tab and a newline, corrupting the path.
        self.filename = open(r'f:\test\newsong.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write *item* as one JSON line and pass it on unchanged.

        Args:
            item: The scraped item; it is converted with ``dict(item)``.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file; the
        # newline puts each record on its own line.
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text)
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.filename.close()


# ---- settings.py: uncomment this setting ----
ITEM_PIPELINES = {
    'groad.pipelines.GroadPipeline': 300,
}


# ---- newsong.py ----
import scrapy
from groad.items import GroadItem


class NewsongSpider(scrapy.Spider):
    """Spider that extracts name, playing time and link of each new song."""

    name = 'newsong'
    allowed_domains = ['www.kugou.com']
    start_urls = ['http://www.kugou.com/']

    def parse(self, response):
        """Yield one ``GroadItem`` per ``<li>`` under ``#SongtabContent``.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            GroadItem: ``songname``, ``songtime`` and ``href_song`` of a
            song. A field is ``None`` when its node is missing from the page.
        """
        for li in response.xpath('//*[@id="SongtabContent"]/ul/li'):
            # A fresh item per song: re-yielding one shared instance would
            # let later assignments overwrite items already handed out.
            item = GroadItem()
            item['songname'] = li.xpath('./a/span[1]/text()').extract_first()
            item['songtime'] = li.xpath(
                './a/span[@class="songTime"]/text()').extract_first()
            item['href_song'] = li.xpath('./a/@href').extract_first()
            yield item