zoukankan      html  css  js  c++  java
  • python数据分析-网页信息抓取

    HTML简述:

    import urllib.request;
    from bs4 import BeautifulSoup;
    response=urllib.request.urlopen("file:///C:/PA/6.1/html.html");
    
    html=response.read();
    html
    
    soup=BeautifulSoup(html);
    soup
    
    soup.find("tr");
    soup.find_all("tr")
    View Code

    JSON简述:

    import json;
    import urllib.request;
    from bs4 import BeautifulSoup;
    response=urllib.request.urlopen("file:///C:/PA/6.2/json.json");
    
    
    jsonstring=response.read();
    
    jsonobject=json.loads(jsonstring.decode())
    
    jsonobject["employees"]
    jsonobject["employees"][0]
    jsonobject["employees"][0]["lastName"]
    View Code

    解析网页:

     综合案例分析:

    # -*- coding: utf-8 -*-
    import json;
    import urllib.request;
    
    from pandas import Series;
    from pandas import DataFrame;
    
    from bs4 import BeautifulSoup;
    
    response = urllib.request.urlopen('http://item.jd.com/1185291.html');
    
    html = response.read();
    
    soup = BeautifulSoup(html);
    
    divSoup = soup.find(id="product-detail-2")
    
    data = DataFrame(columns=['Feature', 'Property'])
    
    trs = divSoup.find_all('tr');
    
    for tr in trs :
        tds = tr.find_all('td');
        if len(tds)==2:
            f=tds[0].getText();
            p=tds[1].getText();  
            data = data.append(
                Series(
                    [f, p], 
                    index=['Feature', 'Property']
                ), ignore_index=True
            );
    
    len(data)   
    
    response = urllib.request.urlopen('http://p.3.cn/prices/get?skuid=J_1185291');
    jsonString = response.read();
    
    jsonObject = json.loads(jsonString.decode())
    
    jsonObject[0]['p']
    View Code

    复杂些的:

    # -*- coding: utf-8 -*-
    import json;
    import urllib.request;
    
    from pandas import Series;
    from pandas import DataFrame;
    from pandas import read_csv;
    from bs4 import BeautifulSoup;
    
    pids = read_csv("D:\PA\6.5\pids.csv");
    
    #定义统一的列名,下面可以避免重复输入,直接引用即可
    pColumns = ['PID', 'Price'];
    fColumns = ['PID', 'Feature', 'Property'];
    
    #定义存储价格和属性的数据框
    pData = DataFrame(columns=pColumns);
    fData = DataFrame(columns=fColumns);
    
    #开始遍历我们需要抓取的商品数据
    for pid in pids.values:
        PID = pid[0].astype(str);
    
        pUrl = 'http://p.3.cn/prices/get?skuid=J_' + PID;
        print("开始处理价格数据:" + pUrl);
        
        response = urllib.request.urlopen(pUrl);
        jsonString = response.read();
    
        jsonObject = json.loads(jsonString.decode());
    
        Price = float(jsonObject[0]['p']);
        pData = pData.append(
            Series(
                [PID, Price], 
                index=pColumns
            ), ignore_index=True
        );
        
        fUrl = "http://item.jd.com/" + PID + ".html";
        print("开始处理属性数据:" + fUrl);
        response = urllib.request.urlopen(fUrl);
    
        html = response.read();
        soup = BeautifulSoup(html);
    
        divSoup = soup.find(id="product-detail-2");
        trs = divSoup.find_all('tr');
    
        for tr in trs :
            tds = tr.find_all('td');
            if len(tds)==2:
                Feature = tds[0].getText();
                Property=tds[1].getText();  
                fData = fData.append(
                    Series(
                        [PID, Feature, Property], 
                        index=fColumns
                    ), ignore_index=True
                );
    
    print(pData);
    print(fData);
    View Code
  • 相关阅读:
    Tensorflow io demo (待)
    tf.Dataset
    tf.estimator
    并发队列
    Callable的Future模式
    hadoop之HDFS介绍
    线程池
    并发工具类
    并发编程
    初学hadoop之hadoop集群搭建
  • 原文地址:https://www.cnblogs.com/qiuyuyu/p/9145884.html
Copyright © 2011-2022 走看看