zoukankan      html  css  js  c++  java
  • java利用爬虫技术抓取(省、市(区号邮编)、县)数据

    近期项目需要用到城市的地址信息,但从网上下载的xml数据没有几个是最新的,数据太老,导致有些地区不全。所以才想到天气预报官网肯定有最新最全的数据。贴出代码,希望能帮到有相同困惑的朋友,节省一些时间。

    	/**
    	 * @param var  城市名称
    	 * @return	string数组。0表示邮编	1表示区号
    	 */
    	@SuppressWarnings("deprecation")
    	private String[] getZipCode(String var) {
    		String[] code = new String[2];
    		String zipCode_S = "邮编:";
    		String zipCode_E = " ";
    		String qhCode_S = "区号:";
    		String qhCode_E = "</td>";
    		String encode = URLEncoder.encode(var);
    		try {
    			URL url = new URL("http://www.ip138.com/post/search.asp?

    area=" + encode + "&action=area2zone"); BufferedReader br = new BufferedReader(new InputStreamReader( url.openStream(), "GBK")); for (String line; (line = br.readLine()) != null;) { int zipNum = line.indexOf(zipCode_S); if (zipNum > 1) { String str = line.substring(zipNum + zipCode_S.length()); str = str.substring(0, str.indexOf(zipCode_E)); code[0] = str; } int qhNum = line.indexOf(qhCode_S); if(qhNum > 1) { String str = line.substring(qhNum + qhCode_S.length()); str = str.substring(0, str.indexOf(qhCode_E)); code[1] = str; break; } } } catch (Exception e) { System.out.println(var +" 错误"+e.toString()); } return code; } /** * 主程序 * @throws Exception */ @Test public void main() throws Exception { //1:获取全部省份 TreeMap<String,String> provincesBuffer = getAddressInfo("http://www.weather.com.cn//data/city3jdata/china.html"); Element prcEle = DocumentHelper.createElement("Provinces"); //2:依据省份获取城市 Element citysEle = DocumentHelper.createElement("Citys"); //3:依据省份城市获取区、县 Element distEle = DocumentHelper.createElement("Districts"); int p = 1; int c = 1; int d = 1; for(Entry<String, String> prc : provincesBuffer.entrySet()) { Element province = DocumentHelper.createElement("Province"); province.addAttribute("ID",""+(p)).addAttribute("ProvinceName", prc.getValue()).addText(prc.getValue()); //获取邮政编号 TreeMap<String,String> cityBuffer = getAddressInfo("http://www.weather.com.cn/data/city3jdata/provshi/"+prc.getKey()+".html"); for(Entry<String, String> citys : cityBuffer.entrySet()) { Element city = DocumentHelper.createElement("City"); String[] zipCode = getZipCode(citys.getValue()); if(zipCode[0]==null||zipCode[1]==null) System.out.println("缺少"+citys.getValue()+"邮政或区号!"); city.addAttribute("ID", ""+c).addAttribute("CityName", citys.getValue()).addAttribute("PID",p+"").addAttribute("ZipCode", zipCode[0]).addAttribute("AreaCode", zipCode[1]).addText(citys.getValue()); TreeMap<String, String> distsBuffer = 
getAddressInfo("http://www.weather.com.cn/data/city3jdata/station/"+prc.getKey()+""+citys.getKey()+".html"); for(Entry<String, String> dists : distsBuffer.entrySet()) { String value = dists.getValue(); if(value.equals(citys.getValue())) continue; Element district = DocumentHelper.createElement("District"); district.addAttribute("ID",""+(d++)).addAttribute("DistrictName", dists.getValue()).addAttribute("CID", c+"").addText(dists.getValue()); distEle.add(district); } citysEle.add(city); c++; } prcEle.add(province); p++; } //4:保存到本地 saveInf("f:\Provinces.xml",prcEle); saveInf("f:\Citys.xml",citysEle); saveInf("f:\Districts.xml",distEle); } /** 保存xml * @param savePath xml保存路径 * @param varEle 根元素 */ private void saveInf(String savePath, Element varEle) { Document varDoc = DocumentHelper.createDocument(); varDoc.add(varEle); try { XMLWriter xmlwri = new XMLWriter(new FileOutputStream(new File(savePath)), new OutputFormat(" ", true, "UTF-8")); xmlwri.write(varDoc); xmlwri.close(); } catch (Exception e) { System.out.println(savePath +"失败,原因例如以下"); throw new RuntimeException(e); } } /** * 获取信息 * @param address url路径 * @return key :信息编号 value:信息名称 */ private TreeMap<String, String> getAddressInfo(String address) { TreeMap<String,String> china = new TreeMap<String, String>(); BufferedReader br = null; String buffer = null; try { URL url = new URL(address); br = new BufferedReader(new InputStreamReader(url.openStream(),"UTF-8")); buffer = br.readLine(); } catch (Exception e) { System.out.println("错误:"+e.getMessage()); }finally{ if(br != null) try { br.close(); } catch (IOException e) { e.printStackTrace(); } } if(buffer==null) return china; buffer = buffer.replaceAll("\{|\}|"",""); String[] splits = buffer.split(","); for(String sp : splits) { String[] split = sp.split(":"); if(split!=null && split.length == 2) china.put(split[0], split[1]); else System.out.println(address); } buffer = null; return china; }


    下载xml数据

  • 相关阅读:
    ImportError: libXext.so.6: cannot open shared object file: No such file or directory
    Django项目添加日志
    Django项目DEBUG=False时配置静态文件
    Django项目DEBUG=False时配置静态文件
    真的佩服python强大表达力
    mycharm环境建立django项目并增删改查
    Apache配置https
    安卓签名
    Android studion不能启动问题
    带你入门函数式编程
  • 原文地址:https://www.cnblogs.com/wzjhoutai/p/6801395.html
Copyright © 2011-2022 走看看