Scraping weather data

Scrape the minimum temperature of every city from weather.com.cn's region pages and print the ten coldest cities.
import requests
from bs4 import BeautifulSoup
# from pyecharts import Bar

cities_temp = []

# Parse one region page and collect each city's minimum temperature
def parse_url(url):
    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"}
    resp = requests.get(url, headers=headers)
    text = resp.content.decode("utf-8")
    soup = BeautifulSoup(text, "lxml")
    conMidtab = soup.find("div", attrs={"class": "conMidtab"})
    tables = conMidtab.find_all("table")
    for table in tables:
        trs = table.find_all("tr")[2:]  # the first two rows are table headers
        for index, tr in enumerate(trs):
            cities = {}
            tds = tr.find_all("td")
            city_td = tds[0]
            if index == 0:
                # in the first row of each table the province occupies tds[0], so the city sits in tds[1]
                city_td = tds[1]
            city = list(city_td.stripped_strings)[0]
            temp_td = tds[-2]
            min_temp = list(temp_td.stripped_strings)[0]
            cities["城市"] = city
            cities["最低温度"] = int(min_temp)  # store as int so the sort below is numeric, not lexicographic
            cities_temp.append(cities)

def main():
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml'
    ]
    for url in urls:
        parse_url(url)
    # Sort by minimum temperature and print the ten coldest cities
    cities_temp.sort(key=lambda data: data['最低温度'])
    data = cities_temp[0:10]
    for d in data:
        for k, v in d.items():
            print(k + ": " + str(v))
        print("*" * 30)

if __name__ == '__main__':
    main()
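The commented-out pyecharts import suggests the results were meant to be charted. A minimal sketch of how the ten coldest cities could be drawn as a bar chart, assuming pyecharts 1.x (where Bar lives in pyecharts.charts rather than at the top level as in the old 0.5 API; render_chart and the output file name are illustrative, not part of the original script):

from pyecharts.charts import Bar

# Sketch: render a bar chart of the ten coldest cities.
# Assumes cities_temp has already been filled and sorted by main().
def render_chart(data):
    cities = [d["城市"] for d in data]
    temps = [d["最低温度"] for d in data]
    bar = (
        Bar()
        .add_xaxis(cities)
        .add_yaxis("最低温度", temps)
    )
    bar.render("min_temp.html")  # writes a standalone HTML file with the chart

Calling render_chart(cities_temp[0:10]) at the end of main() would produce the chart from the same top-ten slice that the script prints.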
Scraping JD product data

Search JD for 裤子 (trousers, URL-encoded as %E8%A3%A4%E5%AD%90 in the URLs below) and collect the product image URLs from each results page.
import requests
from bs4 import BeautifulSoup

class spiders:
    def __init__(self, page):
        self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page={0}'.format(page)
        self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        self.search_urls = 'https://search.jd.com/s_new.php?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&wq=%E8%A3%A4%E5%AD%90&page={0}&s=26&scrolling=y&pos=30&show_items={1}'
        self.pids = set()      # all product ids on the page, used to build the URL for the remaining 30 images; a set deduplicates them
        self.img_urls = set()  # all image URLs collected so far

    # Fetch the HTML source of one results page
    def get_html(self):
        res = requests.get(self.url, headers=self.headers)
        html = res.text
        return html

    # Collect the product id of every item on the page
    def get_pids(self):
        html = self.get_html()
        soup = BeautifulSoup(html, 'lxml')
        lis = soup.find_all("li", class_='gl-item')
        for li in lis:
            data_pid = li.get("data-sku")
            if data_pid:
                self.pids.add(data_pid)

    # Collect the image URLs. The page is loaded via AJAX, so images near the top carry the src attribute while the lazy-loaded ones use data-lazy-img
    def get_src_imgs_data(self):
        html = self.get_html()
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all("div", class_='p-img')  # image containers
        for div in divs:
            img_1 = div.find("img").get('data-lazy-img')  # URL of an image that has not been loaded yet
            img_2 = div.find("img").get("src")            # URL of an image that has already been loaded
            if img_1:
                self.img_urls.add(img_1)
            if img_2:
                self.img_urls.add(img_2)

    def main(self):
        self.get_pids()
        self.get_src_imgs_data()
        urls = self.img_urls
        for url in urls:
            print('https:' + url)

if __name__ == '__main__':
    for i in range(1, 101):
        page = i * 2 - 1  # each visible page maps to an odd page number, while the AJAX request uses the even one, so page + 1 is needed when fetching the extension page
        spiders(page).main()
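Note that self.search_urls and self.pids are built up but never used: the template points at JD's s_new.php AJAX endpoint that serves the second batch of 30 items, which is what the page + 1 comment above refers to. A hedged sketch of how they might be wired together; get_extend_imgs is a hypothetical method added to the spiders class, and it assumes show_items accepts a comma-separated id list and that the endpoint returns the same p-img markup as the main page:

    # Hypothetical extension of spiders: fetch the 30 lazy-loaded items through
    # the s_new.php AJAX endpoint, using the product ids gathered by get_pids().
    def get_extend_imgs(self, page):
        # Assumption: show_items takes the ids comma-separated, and the AJAX
        # request uses the even page number (hence page + 1).
        url = self.search_urls.format(page + 1, ','.join(self.pids))
        res = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(res.text, 'lxml')
        for div in soup.find_all("div", class_='p-img'):
            img = div.find("img").get('data-lazy-img') or div.find("img").get("src")
            if img:
                self.img_urls.add(img)

Under these assumptions, main() would take the page number as a parameter and call self.get_extend_imgs(page) after get_src_imgs_data(), so each iteration prints both halves of the results page.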