zoukankan      html  css  js  c++  java
  • [Java] 数据分析--数据预处理

    数据结构

    • 键-值对:HashMap
     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.util.HashMap;
     4 import java.util.Scanner;
     5 
     /**
      * Demonstrates key-value storage with a {@link HashMap}: loads
      * country/population pairs from a whitespace-delimited text file and prints
      * the number of entries plus one sample lookup.
      */
     public class HashMapExample {
         /**
          * Reads {@code data/Countries.dat} (alternating country name and integer
          * population tokens) into a map, then prints the map size and the value
          * stored for "Peru".
          *
          * @param args unused
          */
         public static void main(String[] args) {
             File dataFile = new File("data/Countries.dat");
             HashMap<String, Integer> dataset = new HashMap<>();
             // try-with-resources closes the Scanner (and the underlying file)
             // on every exit path, including a failed nextInt().
             try (Scanner input = new Scanner(dataFile)) {
                 while (input.hasNext()) {
                     String country = input.next();
                     int population = input.nextInt();
                     dataset.put(country, population);
                 }
             } catch (FileNotFoundException e) {
                 System.out.println(e);
             }
             System.out.printf("dataset.size(): %d%n", dataset.size());
             // The quotes around Peru must be escaped inside the format string;
             // a missing key is formatted as "null" rather than throwing.
             System.out.printf("dataset.get(\"Peru\"): %,d%n", dataset.get("Peru"));
         }
     }
    View Code

    文件处理

    • csv文件
      • 将Map数据存入Excel文件(.xls)
     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.io.FileOutputStream;
     4 import java.io.IOException;
     5 import java.util.Map;
     6 import java.util.Scanner;
     7 import java.util.Set;
     8 import java.util.TreeMap;
     9 import org.apache.poi.hssf.usermodel.HSSFRow;
    10 import org.apache.poi.hssf.usermodel.HSSFSheet;
    11 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    12 
    13 public class FromMapToExcel {
    14     public static void main(String[] args) {
    15         Map<String,Integer> map = new TreeMap();
    16         load(map, "data/Countries.dat");
    17         print(map);
    18         storeXL(map, "data/Countries.xls", "Countries Worksheet");
    19     }
    20     
    21     /** Loads the data from the specified file into the specified map.
    22     */
    23     public static void load(Map map, String fileSpec) {
    24         File file = new File(fileSpec);
    25         try {
    26             Scanner input = new Scanner(file);
    27             while (input.hasNext()) {
    28                 String country = input.next();
    29                 int population = input.nextInt();
    30                 map.put(country, population);
    31             }
    32         } catch (FileNotFoundException e) {
    33             System.out.println(e);
    34         }
    35     }
    36     
    37     public static void print(Map map) {
    38         Set countries = map.keySet();
    39         for (Object country : countries) {
    40             Object population = map.get(country);
    41             System.out.printf("%-10s%,12d%n", country, population);
    42         }
    43     }
    44     
    45     /** Stores the specified map in the specified worksheet of 
    46         the specified Excel workbook file.
    47      * @param map
    48      * @param fileSpec
    49      * @param sheet
    50     */
    51     public static void storeXL(Map map, String fileSpec, String sheet) {
    52         try {
    53             FileOutputStream out = new FileOutputStream(fileSpec);
    54             HSSFWorkbook workbook = new HSSFWorkbook();
    55             HSSFSheet worksheet = workbook.createSheet(sheet);
    56             Set countries = map.keySet();
    57             short rowNum = 0;
    58             for (Object country : countries) {
    59                 Object population = map.get(country);
    60                 HSSFRow row = worksheet.createRow(rowNum);
    61                 row.createCell(0).setCellValue((String)country);
    62                 row.createCell(1).setCellValue((Integer)population);
    63                 ++rowNum;
    64             }
    65             workbook.write(out);
    66             out.flush();
    67             out.close();
    68         } catch (FileNotFoundException e) {
    69             System.err.println(e);
    70         } catch (IOException e) {
    71             System.err.println(e);
    72         }
    73     }
    74 }
    View Code
      • 读取csv文件
     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.util.HashMap;
     4 import java.util.Scanner;
     5 
     /**
      * Reads a two-column CSV file (a header row followed by
      * country,population rows) and prints it as an aligned table.
      */
     public class ReadingCSVFiles {
         /**
          * Prints the contents of {@code data/Countries.csv}: first the two
          * header fields, then every country with its population.
          *
          * @param args unused
          */
         public static void main(String[] args) {
             File dataFile = new File("data/Countries.csv");
             // try-with-resources closes the Scanner on every exit path.
             try (Scanner input = new Scanner(dataFile)) {
                 // Fields are separated by a comma or by whitespace (line breaks).
                 // The backslash must be doubled so the regex engine sees \s;
                 // "+" makes a Windows "\r\n" line ending count as one delimiter.
                 input.useDelimiter(",|\\s+");
                 String column1 = input.next();
                 String column2 = input.next();
                 System.out.printf("%-10s%12s%n", column1, column2);
                 while (input.hasNext()) {
                     String country = input.next();
                     int population = input.nextInt();
                     System.out.printf("%-10s%,12d%n", country, population);
                 }
             } catch (FileNotFoundException e) {
                 System.out.println(e);
             }
         }
     }
    View Code
      • 读取Excel文件(.xls)到Map
     1 import static dawj.ch02.FromMapToExcel.print;
     2 import java.io.FileInputStream;
     3 import java.io.FileNotFoundException;
     4 import java.io.IOException;
     5 import java.util.Map;
     6 import java.util.TreeMap;
     7 import org.apache.poi.hssf.usermodel.HSSFCell;
     8 import org.apache.poi.hssf.usermodel.HSSFRow;
     9 import org.apache.poi.hssf.usermodel.HSSFSheet;
    10 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
    11 import org.apache.poi.ss.usermodel.DataFormatter;
    12 import org.apache.poi.ss.usermodel.Row;
    13 
    14 public class FromExcelToMap {
    15     public static void main(String[] args) {
    16         Map map = loadXL("data/Countries.xls", "Countries Worksheet");
    17         print(map);
    18     }
    19     
    20     /** Returns a Map object containing the data from the specified 
    21         worksheet in the specified Excel file.
    22     */
    23     public static Map loadXL(String fileSpec, String sheetName) {
    24         Map<String,Integer> map = new TreeMap();
    25         try {
    26             FileInputStream stream = new FileInputStream(fileSpec);
    27             HSSFWorkbook workbook = new HSSFWorkbook(stream);
    28             HSSFSheet worksheet = workbook.getSheet(sheetName);
    29             DataFormatter formatter = new DataFormatter();
    30             for (Row row : worksheet) {
    31                 HSSFRow hssfRow = (HSSFRow)row;
    32                 HSSFCell cell = hssfRow.getCell(0);
    33                 String country = cell.getStringCellValue();
    34                 cell = hssfRow.getCell(1);
    35                 String str = formatter.formatCellValue(cell);
    36                 int population = (int)Integer.getInteger(str);
    37                 map.put(country, population);
    38             }
    39         } catch (FileNotFoundException e) {
    40             System.err.println(e);
    41         } catch (IOException e) {
    42             System.err.println(e);
    43         }
    44         return map;
    45     }
    46 }
    View Code
    • 解析JSON文件
     1 import java.io.File;
     2 import java.io.FileInputStream;
     3 import java.io.FileNotFoundException;
     4 import java.io.IOException;
     5 import java.io.InputStream;
     6 import java.util.ArrayList;
     7 import java.util.HashMap;
     8 import javax.json.Json;
     9 import javax.json.stream.JsonParser;
    10 import javax.json.stream.JsonParser.Event;
    11 
    12 public class ParsingJSONFiles {
    13     public static void main(String[] args) {
    14         File dataFile = new File("data/Books.json");
    15         try {
    16             InputStream stream = new FileInputStream(dataFile);
    17             JsonParser parser = Json.createParser(stream);
    18             Event event = parser.next();  // advance past START_OBJECT
    19             HashMap<String,Object> map = getMap(parser);
    20             System.out.println(map);
    21             stream.close();
    22         } catch (FileNotFoundException e) {
    23             System.out.println(e);
    24         } catch (IOException e) {
    25             System.out.println(e);
    26         }
    27     }
    28     
    29     /*  Returns the HashMap parsed by the specified parser.
    30         Called when event.equals(event.START_OBJECT):
    31     */
    32     public static HashMap getMap(JsonParser parser) {
    33         HashMap<String,Object> map = new HashMap();
    34         Event event = parser.next();  // advance past START_OBJECT
    35         String key = parser.getString();
    36         event = parser.next();       // advance past KEY_NAME
    37         while (!event.equals(Event.END_OBJECT)) {
    38             if (event.equals(Event.VALUE_STRING)) {
    39                 String value = parser.getString();
    40                 map.put(key, value);
    41             } else if (event.equals(Event.VALUE_NUMBER)) {
    42                 Integer value = parser.getInt();
    43                 map.put(key, value);
    44             } else if (event.equals(Event.START_ARRAY)) {
    45                 ArrayList<String> list = getList(parser);
    46                 map.put(key, list);
    47             }
    48             event = parser.next();
    49             if (event.equals(Event.END_OBJECT)) {
    50                 break;
    51             }
    52             key = parser.getString();
    53             event = parser.next();
    54         }
    55         return map;
    56     }
    57     
    58     /*  Returns the ArrayList parsed by the specified parser.
    59         Called when event.equals(event.START_ARRAY):
    60     */
    61     public static ArrayList getList(JsonParser parser) {
    62         ArrayList list = new ArrayList();
    63         Event event = parser.next();  // advance past START_ARRAY
    64         while (!event.equals(Event.END_ARRAY)) {
    65             if (event.equals(Event.VALUE_STRING)) {
    66                 list.add(parser.getString());
    67                 event = parser.next();
    68             } else if (event.equals(Event.START_OBJECT)) {
    69                 HashMap<String,Object> map = getMap(parser);
    70                 list.add(map);
    71                 event = parser.next();
    72             } else if (event.equals(Event.START_ARRAY)) {
    73                 ArrayList subList = getList(parser);   //  recursion
    74                 list.add(subList);
    75                 event = parser.next();
    76             }
    77         }
    78         return list;
    79     }
    80 }
    View Code

    数据处理

    • 生成测试数据集
     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.io.PrintWriter;
     4 import java.util.Random;
     5 
     /**
      * Generates a small CSV file of random numbers for use as test data.
      */
     public class GeneratingTestData {
         private static final int ROWS = 8, COLS = 5;
         private static final Random RANDOM = new Random();

         /**
          * Writes ROWS lines of COLS comma-separated random doubles, each with
          * six decimal places, to {@code data/Output.csv}. If the file cannot
          * be created the exception is reported on standard error.
          *
          * @param args unused
          */
         public static void main(String[] args) {
             File outputFile = new File("data/Output.csv");
             // try-with-resources flushes and closes the writer on every path.
             try (PrintWriter writer = new PrintWriter(outputFile)) {
                 for (int i = 0; i < ROWS; i++) {
                     for (int j = 0; j < COLS - 1; j++) {
                         // Locale.ROOT pins the decimal separator to '.'; with the
                         // default locale a comma-decimal locale would emit
                         // "0,123456," and corrupt the comma-separated layout.
                         writer.printf(java.util.Locale.ROOT, "%.6f,", RANDOM.nextDouble());
                     }
                     // The last column ends the line instead of adding a comma.
                     writer.printf(java.util.Locale.ROOT, "%.6f%n", RANDOM.nextDouble());
                 }
             } catch (FileNotFoundException e) {
                 System.err.println(e);
             }
         }
     }
    View Code
    • 数据过滤
      • 需求:选择国土面积超过100万平方公里的内陆国家
      • 过程:数据为dat格式,先定义对应的简单类Country,再编写程序把dat中的数据读入Country的Set中,最后做筛选

    Country.java

     1 import java.util.HashSet;
     2 import java.util.Scanner;
     3 
     /**
      * One record of the countries data set: name, population, area, and
      * whether the country is landlocked.
      */
     class Country {
         protected String name;
         protected int population;
         protected int area;
         protected boolean landlocked;

         /**
          * Constructs a new Country object from the next four tokens being
          * scanned: name, population, area, landlocked flag. If no tokens
          * remain, the fields keep their default values (name is null).
          *
          * @param in scanner positioned at the start of a record
          */
         public Country(Scanner in) {
             // hasNext() rather than hasNextLine(): after the last record only
             // a line terminator may remain, which still counts as a "line"
             // but holds no token, so next() would throw NoSuchElementException.
             if (in.hasNext()) {
                 this.name = in.next();
                 this.population = in.nextInt();
                 this.area = in.nextInt();
                 this.landlocked = in.nextBoolean();
             }
         }

         /** Returns the four fields formatted as one aligned text row. */
         @Override
         public String toString() {
             return String.format("%-10s %,12d %,12d %b",
                     name, population, area, landlocked);
         }
     }
    View Code

    FilteringData.java

     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.util.HashSet;
     4 import java.util.Scanner;
     5 import java.util.Set;
     6 import java.util.TreeMap;
     7 
     8 public class FilteringData {
     9     private static final int MIN_AREA = 1000000;  // one million 
    10     public static void main(String[] args) {
    11         File file = new File("data/Countries.dat");
    12         Set<Country> dataset = readDataset(file);
    13         
    14         for (Country country : dataset) {
    15             if (country.landlocked && country.area >= MIN_AREA) {
    16                 System.out.println(country);
    17             }
    18         }
    19     }
    20     
    21     public static Set readDataset(File file) {
    22         Set<Country> set = new HashSet();
    23         try {
    24             Scanner input = new Scanner(file);
    25             input.nextLine();  // read past headers
    26             while (input.hasNextLine()) {
    27                 set.add(new Country(input));
    28             }
    29             input.close();
    30         } catch (FileNotFoundException e) {
    31             System.out.println(e);
    32         }
    33         return set;
    34     }
    35 }
    View Code
    • 排序
      • 需求:将Countries.dat中的数据按population进行排序
      • 实现:将数据存入TreeMap
      • 注意:关键字段必须唯一,即两个国家人口不能相同
     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.util.Collections;
     4 import java.util.HashMap;
     5 import java.util.Scanner;
     6 import java.util.Set;
     7 import java.util.TreeMap;
     8 
     /**
      * Sorts country records by population by loading them into a TreeMap
      * keyed on population, then prints them in ascending order.
      */
     public class SortingData {
         /**
          * Reads alternating country-name / population tokens from the data
          * file and prints the records sorted by population.
          *
          * @param args unused
          */
         public static void main(String[] args) {
             File file = new File("src/main/java/com/hongfeng/SortingData/Countries.dat");
             TreeMap<Integer, String> dataset = new TreeMap<>();
             // try-with-resources closes the Scanner even if nextInt() fails.
             try (Scanner input = new Scanner(file)) {
                 while (input.hasNext()) {
                     String country = input.next();
                     int population = input.nextInt();
                     // A plain put() would silently drop the earlier country when
                     // two populations are equal, so ties share one entry.
                     String sameKey = dataset.get(population);
                     dataset.put(population,
                             sameKey == null ? country : sameKey + ", " + country);
                 }
             } catch (FileNotFoundException e) {
                 System.out.println(e);
             }
             print(dataset);
         }

         /**
          * Prints one line per entry in ascending key order: the key as a
          * grouped integer in 12 columns, then the value left-justified.
          *
          * @param map population-to-name entries to print
          */
         public static void print(TreeMap<Integer,String> map) {
             // entrySet() avoids a second lookup per key.
             for (java.util.Map.Entry<Integer, String> entry : map.entrySet()) {
                 System.out.printf("%,12d  %-16s%n", entry.getKey(), entry.getValue());
             }
         }
     }
    View Code
    • 合并
      • 需求:将多个排好序的文件合并为单个排好序的文件
      • Country类实现Comparable接口,定义从文件创建对象的构造方法,以及比较方法compareTo
      • 扫描两个文件,比较,存入新文件,一个文件扫描完后,另一个文件逐项扫描即可

    Country.java

     1 import java.util.Scanner;
     2 
     /**
      * A country name with its population, ordered by population so that
      * sorted files of countries can be merged.
      */
     class Country implements Comparable<Country> {
         protected String name;
         protected int population;

         /**
          * Constructs a new Country object from the next two tokens being
          * scanned: name, then population. If no tokens remain, the fields keep
          * their default values and {@link #isNull()} returns true.
          *
          * @param in scanner positioned at the start of a record
          */
         public Country(Scanner in) {
             // hasNext() rather than hasNextLine(): a trailing line terminator
             // still counts as a "line" but holds no token, so next() would
             // throw NoSuchElementException at the end of a normal file.
             if (in.hasNext()) {
                 this.name = in.next();
                 this.population = in.nextInt();
             }
         }

         /** Returns true if no record was available when this object was built. */
         public boolean isNull(){
             return this.name == null;
         }

         /**
          * Orders countries by ascending population.
          *
          * @param that the country to compare with
          * @return a negative, zero, or positive value as this population is
          *         less than, equal to, or greater than the other
          */
         @Override
         public int compareTo(Country that){
             // Integer.compare cannot overflow, unlike subtracting the two ints.
             return Integer.compare(this.population, that.population);
         }

         /** Returns the name and grouped population as one aligned text row. */
         @Override
         public String toString() {
             return String.format("%-10s %,12d",
                     name, population);
         }
     }
    View Code

    MergingFiles

     1 import java.io.File;
     2 import java.io.FileNotFoundException;
     3 import java.io.PrintWriter;
     4 import java.util.Scanner;
     5 
     6 public class MergingFiles {
     7     public static void main(String[] args) {
     8         File inFile1 = new File("data/Countries1.dat");
     9         File inFile2 = new File("data/Countries2.dat");
    10         File outFile = new File("data/Countries.dat");
    11         try {
    12             Scanner in1 = new Scanner(inFile1);
    13             Scanner in2 = new Scanner(inFile2);
    14             PrintWriter out = new PrintWriter(outFile);
    15             Country country1 = new Country(in1);
    16             Country country2 = new Country(in2);  
    17             System.out.println(country1.hashCode());
    18             System.out.println(country2.hashCode());
    19             while (!country1.isNull() && !country2.isNull()) {
    20                 if (country1.compareTo(country2) < 0) {
    21                     out.println(country1);
    22                     country1 = new Country(in1);
    23                 } else {
    24                     out.println(country2);
    25                     country2 = new Country(in2);
    26                 }
    27             }
    28             while (!country1.isNull()) {
    29                 out.println(country1);
    30                 country1 = new Country(in1);
    31             }
    32             while (!country2.isNull()) {
    33                 out.println(country2);
    34                 country2 = new Country(in2);
    35             }
    36             in1.close();
    37             in2.close();
    38             out.close();
    39         } catch (FileNotFoundException e) {
    40             System.out.println(e);
    41         }
    42     }
    43 }
    View Code
  • 相关阅读:
    C++ 之头文件依赖和引用类型的成员变量
    go语言学习之结构体
    go语言学习之解析XML
    VSCode编辑器使用技巧:快捷输入HTML代码(转)
    Qt QNetworkAccessManager请求导致的软件闪退
    注册表在64位操作系统下
    Signal和Slot是同步的还是异步的
    C++之private虚函数
    eclipse环境下Python报错"undefined variable from import..."的解决方案
    Android占位符
  • 原文地址:https://www.cnblogs.com/cxc1357/p/14674611.html
Copyright © 2011-2022 走看看