  • Scraping data with BeautifulSoup


    Scraping weather data

    import requests
    from bs4 import BeautifulSoup
    
    # from pyecharts import Bar
    
    cities_temp = []
    
    
    # Fetch one regional forecast page and parse every city's minimum temperature
    def parse_url(url):
        headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36(KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"}
        resp = requests.get(url, headers=headers)
        text = resp.content.decode("utf-8")
        soup = BeautifulSoup(text, "lxml")
        conMidtab = soup.find("div", attrs={"class":"conMidtab"})
        tables = conMidtab.find_all("table")
        for table in tables:
            trs = table.find_all("tr")[2:]  # skip the two header rows of each province table
            for index, tr in enumerate(trs):
                cities = {}
                tds = tr.find_all("td")
                # in the first data row tds[0] is the province cell (rowspan),
                # so the city name sits in tds[1] instead
                city_td = tds[0]
                if index == 0:
                    city_td = tds[1]
                city = list(city_td.stripped_strings)[0]
                temp_td = tds[-2]
                min_temp = list(temp_td.stripped_strings)[0]
                cities["城市"] = city
                # store the temperature as an int so the later sort is numeric, not lexicographic
                cities["最低温度"] = int(min_temp)
                cities_temp.append(cities)
    
    
    def main():
        urls = [
            'http://www.weather.com.cn/textFC/hb.shtml',
            'http://www.weather.com.cn/textFC/db.shtml',
            'http://www.weather.com.cn/textFC/hd.shtml',
            'http://www.weather.com.cn/textFC/hz.shtml',
            'http://www.weather.com.cn/textFC/hn.shtml',
            'http://www.weather.com.cn/textFC/xb.shtml',
            'http://www.weather.com.cn/textFC/xn.shtml',
            'http://www.weather.com.cn/textFC/gat.shtml'
        ]
        for url in urls:
            parse_url(url)
    
        # sort by minimum temperature (numeric) and show the ten coldest cities
        cities_temp.sort(key=lambda data: data['最低温度'])
        data = cities_temp[0:10]
        for d in data:
            for k, v in d.items():
                print(k + ": " + str(v))
            print("*" * 30)
    
    
    if __name__ == '__main__':
        main()
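
    The commented-out pyecharts import suggests the results were meant to be charted. Below is a minimal sketch of that step, assuming pyecharts 1.x (where Bar is imported from pyecharts.charts); the function name render_chart and the output filename are illustrative, not part of the original script.

    # Minimal sketch, not part of the original script: plot the ten coldest cities.
    # Assumes pyecharts 1.x, where Bar is imported from pyecharts.charts.
    from pyecharts.charts import Bar


    def render_chart(data):
        # data is a list of {"城市": ..., "最低温度": ...} dicts, already sorted
        chart = (
            Bar()
            .add_xaxis([d["城市"] for d in data])
            .add_yaxis("最低温度", [d["最低温度"] for d in data])
        )
        chart.render("coldest_cities.html")  # writes a standalone HTML chart

    Calling render_chart(cities_temp[0:10]) at the end of main() would produce the chart alongside the printed output.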

    Scraping product data from JD.com

    import requests
    from bs4 import BeautifulSoup
    
    
    class spiders:
        def __init__(self, page):
            self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page={0}'.format(page)
            self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            self.search_urls = 'https://search.jd.com/s_new.php?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&wq=%E8%A3%A4%E5%AD%90&page={0}&s=26&scrolling=y&pos=30&show_items={1}'
            self.pids = set()  # product ids on the page, used to build the URL for the remaining 30 ajax-loaded images; a set deduplicates them
            self.img_urls = set()  # all collected image URLs
    
        # Fetch the HTML source of one search-result page
        def get_html(self):
            res = requests.get(self.url, headers=self.headers)
            html = res.text
            return html
    
        # Collect the product id (data-sku) of every item on the page
        def get_pids(self):
            html = self.get_html()
            soup = BeautifulSoup(html, 'lxml')
            lis = soup.find_all("li", class_='gl-item')
            for li in lis:
                data_pid = li.get("data-sku")
                if data_pid:
                    self.pids.add(data_pid)
    
        # Collect the image URLs on the page. Because the second half of the list is loaded via ajax, images already rendered carry the src attribute while the lazy ones use data-lazy-img
        def get_src_imgs_data(self):
            html = self.get_html()
            soup = BeautifulSoup(html, 'lxml')
            divs = soup.find_all("div", class_='p-img')  # image containers
            for div in divs:
                img_1 = div.find("img").get('data-lazy-img')  # URL of a lazily loaded image
                img_2 = div.find("img").get("src")  # URL of an already loaded image
                if img_1:
                    self.img_urls.add(img_1)
                if img_2:
                    self.img_urls.add(img_2)
    
        def main(self):
            self.get_pids()
            self.get_src_imgs_data()
            urls = self.img_urls
            for url in urls:
                print('https:' + url)
    
    
    if __name__ == '__main__':
        for i in range(1, 101):
            page = i * 2 - 1  # visible result pages use odd page numbers; the matching ajax request uses the next even number, i.e. page + 1
            spiders(page).main()
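
    The self.search_urls template defined in __init__ is never used above; it is the endpoint JD queries via ajax for the 30 items that appear as the page scrolls. Below is a minimal sketch of how it might be wired in, formatting the template with the even page number and the collected ids; the method name get_extend_imgs is hypothetical and the request details are assumptions, not verified against the live site.

    # Minimal sketch, not part of the original script: a method that could be added
    # to the spiders class to fetch the 30 ajax-loaded items.
    def get_extend_imgs(self, page):
        # the ajax half uses the even page number and the ids collected so far
        url = self.search_urls.format(page + 1, ','.join(self.pids))
        res = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(res.text, 'lxml')
        for div in soup.find_all("div", class_='p-img'):
            img = div.find("img")
            src = img.get('data-lazy-img') or img.get('src')
            if src:
                self.img_urls.add(src)

    If this method were added, main() would call self.get_extend_imgs(...) after get_src_imgs_data(); since page is currently only passed to __init__, it would also need to be kept, e.g. stored as self.page.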