  • Scraping data with BeautifulSoup


    Scraping the weather forecast

    import requests
    from bs4 import BeautifulSoup
    
    # from pyecharts import Bar
    
    cities_temp = []
    
    
    # Fetch one region page and collect each city's minimum temperature
    def parse_url(url):
        headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36(KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"}
        resp = requests.get(url, headers=headers)
        text = resp.content.decode("utf-8")
        soup = BeautifulSoup(text, "lxml")
        conMidtab = soup.find("div", attrs={"class":"conMidtab"})
        tables = conMidtab.find_all("table")
        for table in tables:
            trs = table.find_all("tr")[2:]
            for index, tr in enumerate(trs):
                cities = {}
                tds = tr.find_all("td")
                city_td = tds[0]
                # In the first row of each province table the province name
                # occupies the first cell, so the city name is in the second.
                if index == 0:
                    city_td = tds[1]
                city = list(city_td.stripped_strings)[0]
                temp_td = tds[-2]
                min_temp = list(temp_td.stripped_strings)[0]
                cities["城市"] = city
                # Store the temperature as an int so the later sort is numeric
                # rather than lexicographic
                cities["最低温度"] = int(min_temp)
                cities_temp.append(cities)
    
    
    def main():
        urls = [
            'http://www.weather.com.cn/textFC/hb.shtml',
            'http://www.weather.com.cn/textFC/db.shtml',
            'http://www.weather.com.cn/textFC/hd.shtml',
            'http://www.weather.com.cn/textFC/hz.shtml',
            'http://www.weather.com.cn/textFC/hn.shtml',
            'http://www.weather.com.cn/textFC/xb.shtml',
            'http://www.weather.com.cn/textFC/xn.shtml',
            'http://www.weather.com.cn/textFC/gat.shtml'
        ]
        for url in urls:
            parse_url(url)
    
        # Sort by minimum temperature and print the ten coldest cities
        cities_temp.sort(key=lambda data: data['最低温度'])
        data = cities_temp[0:10]
        for d in data:
            for k, v in d.items():
                print(k + ": " + str(v))
            print("*" * 30)
    
    
    if __name__ == '__main__':
        main()
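
    The commented-out pyecharts import suggests the ten coldest cities were meant to be charted as well. Below is a minimal sketch of that step, assuming pyecharts 1.x (whose Bar class lives in pyecharts.charts rather than the 0.x `from pyecharts import Bar` shown above) and reusing the sorted `data` list built in main(); the function name and output file are illustrative only.

    from pyecharts import options as opts
    from pyecharts.charts import Bar


    def draw_chart(data):
        # data is the list of {"城市": ..., "最低温度": ...} dicts from main()
        bar = (
            Bar()
            .add_xaxis([d["城市"] for d in data])
            .add_yaxis("最低温度", [d["最低温度"] for d in data])
            .set_global_opts(title_opts=opts.TitleOpts(title="Ten coldest cities"))
        )
        # render() writes a standalone HTML file that opens in any browser
        bar.render("coldest_cities.html")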

    Scraping JD.com product data

    import requests
    from bs4 import BeautifulSoup
    
    
    class spiders:
        def __init__(self, page):
            self.url = 'https://search.jd.com/Search?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=5&wq=%E8%A3%A4%E5%AD%90&page={0}'.format(page)
            self.headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
            self.search_urls = 'https://search.jd.com/s_new.php?keyword=%E8%A3%A4%E5%AD%90&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&offset=3&wq=%E8%A3%A4%E5%AD%90&page={0}&s=26&scrolling=y&pos=30&show_items={1}'
            self.pids = set()  # item ids on the page, used to build the URL for the remaining 30 images; a set deduplicates them
            self.img_urls = set()  # all image URLs collected
    
        # Fetch the HTML source of one search page
        def get_html(self):
            res = requests.get(self.url, headers=self.headers)
            html = res.text
            return html
    
        # Collect the data-sku id of every item on the page
        def get_pids(self):
            html = self.get_html()
            soup = BeautifulSoup(html, 'lxml')
            lis = soup.find_all("li", class_='gl-item')
            for li in lis:
                data_pid = li.get("data-sku")
                if data_pid:
                    self.pids.add(data_pid)
    
        # Collect the product images; the second half of the page is loaded via ajax, so images already rendered use the src attribute while the lazy ones use data-lazy-img
        def get_src_imgs_data(self):
            html = self.get_html()
            soup = BeautifulSoup(html, 'lxml')
            divs = soup.find_all("div", class_='p-img')  # 图片
            for div in divs:
                img_1 = div.find("img").get('data-lazy-img')  # URL of an image that has not been loaded yet
                img_2 = div.find("img").get("src")  # URL of an image that has already been loaded
                if img_1:
                    self.img_urls.add(img_1)
                if img_2:
                    self.img_urls.add(img_2)
    
        def main(self):
            self.get_pids()
            self.get_src_imgs_data()
            urls = self.img_urls
            for url in urls:
                print('https:' + url)
    
    
    if __name__ == '__main__':
        for i in range(1, 101):
            page = i * 2 - 1  # visible pages use odd numbers, while the ajax requests use even ones, so the extended request needs page + 1
            spiders(page).main()
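
    The class builds self.search_urls and collects self.pids but never uses them; per the comments, the intent is to hit the ajax endpoint with the even page number (page + 1) and the collected ids in show_items so the remaining lazily loaded products are returned. Below is a rough sketch of that missing step, written as an extra method for the spiders class; the endpoint behaviour and parameters are assumptions based on the URL template above, not verified against the current JD site.

    # Hypothetical addition to the spiders class
    def get_extend_imgs(self, page):
        # The ajax request uses the even page number and the ids gathered
        # from the first half of the page, joined with commas
        url = self.search_urls.format(page + 1, ','.join(self.pids))
        res = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(res.text, 'lxml')
        for div in soup.find_all("div", class_='p-img'):
            img = div.find("img")
            if img is None:
                continue
            src = img.get('data-lazy-img') or img.get('src')
            if src:
                self.img_urls.add(src)

    main() would then call self.get_extend_imgs(page) after get_pids(); since __init__ does not store the page number, it would have to be passed in explicitly.
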
  • Original article: https://www.cnblogs.com/luweiweicode/p/14335698.html