zoukankan      html  css  js  c++  java
  • 爬取京东评论、分词+词频统计、词云图展示

    一、爬取京东评论

    京东评论竟然全部对外开放

    /**
     * Downloads the comment pages of a product from the JD.com comment endpoint
     * and stores every comment, plus the product itself, through CommentService.
     */
    public class CommentCrawler {
    	final static PoolingHttpClientConnectionManager httpClientConnectionManager = new PoolingHttpClientConnectionManager();
    	/** Upper bound on the number of comment pages fetched for one product. */
    	final static int MAX_PAGE = 50;
    
    	/** Builds an HttpClient on top of the shared pooling connection manager. */
    	static HttpClient getClient() {
    		return HttpClients.custom().setConnectionManager(httpClientConnectionManager).build();
    	}
    
    	/**
    	 * Formats the comment-page URL.
    	 *
    	 * @param productId product identifier placed in the query string
    	 * @param page      page index; crawlComments starts counting at 0
    	 */
    	static String getUrl(String productId, int page) {
    		return String.format(
    				"http://sclub.jd.com/comment/productPageComments.action?productId=%s&score=0&sortType=3&page=%d&pageSize=10",
    				productId, page);
    	}
    
    	/** Maps one JSON comment (fields id, score, content) to a Comment of the given product. */
    	static Comment commentFromJson(JSONObject json, String productId) {
    		return new Comment(json.getLongValue("id"), productId, json.getString("score"), json.getString("content"));
    	}
    
    	/**
    	 * Crawls the comment pages of a product, persists them and then runs
    	 * ProductJudger on the product.
    	 *
    	 * @param productId product whose comments are fetched
    	 * @return true when every requested page was stored and the product was
    	 *         judged; false when a page carried no comments or any exception
    	 *         occurred
    	 */
    	public static boolean crawlComments(String productId) {
    		try {
    			int maxPage = 1;
    			int nowPage = 0;
    			HttpClient client = getClient();
    			while (nowPage < maxPage) {
    				String url = getUrl(productId, nowPage);
    				HttpGet get = new HttpGet(url);
    				HttpResponse resp = client.execute(get);
    				JSONObject json = JSON.parseObject(EntityUtils.toString(resp.getEntity()));
    				// An empty body or a payload without "comments" is treated like an
    				// empty page instead of surfacing as a NullPointerException.
    				JSONArray comments = json == null ? null : json.getJSONArray("comments");
    				if (comments == null || comments.size() == 0)
    					return false;
    				CommentService ser = new CommentService();
    				for (int i = 0; i < comments.size(); i++) {
    					Comment comment = commentFromJson(comments.getJSONObject(i), productId);
    					ser.insertComment(comment);
    				}
    				if (nowPage == 0) {
    					// The first page reports the total page count; never crawl
    					// past MAX_PAGE whatever the server claims. A missing
    					// "maxPage" leaves the single-page default in place.
    					Integer reportedPages = json.getInteger("maxPage");
    					if (reportedPages != null)
    						maxPage = Math.min(reportedPages, MAX_PAGE);
    					ser.insertProduct(new Product(productId, comments.getJSONObject(0).getString("referenceName")));
    				}
    				nowPage++;
    			}
    			ProductJudger.judge(productId);
    			return true;
    		} catch (Exception e) {
    			e.printStackTrace();
    		}
    		return false;
    	}
    }
    

    二、结巴分词

    jieba分词原本是Python版的,有人把它改成了Java版,名字也改成了jieba-analysis

    // Segment the text with jieba and count every token that is not a stop word.
    JiebaSegmenter segmenter = new JiebaSegmenter();
    List<Word> list = segmenter.sentenceProcess(str);
    for (Word i : list) {
    	String token = i.getToken();
    	if (stopWords.contains(token)) {
    		continue;
    	}
    	Integer cnt = map.get(token);
    	// Autoboxing replaces the deprecated Integer(int) constructor;
    	// a token seen for the first time starts at 1.
    	map.put(token, cnt == null ? 1 : cnt + 1);
    }
    

    三、词云图

    用到d3.js、d3-cloud.js这两个库。d3.js的全称是“Data-Driven Documents”;d3-cloud这个库比较难用,主要是官方示例代码太少了。

    这里给出一个例子:每一个词云图都对应一个字典,这个字典就是“词语:频率”这样的键值对。给定多个字典,每一个字典都要渲染成一个词云图。

    <div id="word-clouds" style="text-align: center"></div>
    <script>
        // Payload written into the page by the JSP expression; every entry is
        // read below as an object with 'productId' and 'words'.
        var wordClouds = <%=request.getAttribute("wordClouds")%>;
        $(document).ready(function () {
            var $container = $("#word-clouds");
            // Object.keys visits own keys only, so enumerable properties that a
            // library may have added to a prototype are not treated as entries.
            // "|| {}" keeps a null payload a no-op, as the for...in loop was.
            Object.keys(wordClouds || {}).forEach(function (key) {
                var it = wordClouds[key];
                var divId = "product_wordcloud" + it['productId'];
                // Nodes are built with .attr()/.text() so productId is inserted
                // as data and is never parsed as HTML.
                $container
                        .append($("<h3>").attr("align", "center").text("商品" + it['productId'] + "词云图"))
                        .append($("<div>").attr("id", divId));
                createWordCloud(transformWordFraquency(it['words']), "#" + divId);
            });
        });
    </script>
    
    

    还需要编写如下JS代码

    // Colour scale with 20 colours.
    var fill = d3.scale.category20();

    // Width and height used for the rendered word-cloud <svg>.
    var wordCloudWidth = 800;
    var wordCloudHeight = 400;

    // Font settings shared by the layout and the rendered text.
    var font_name = "楷体";
    var font_weight = "bold";
    // NOTE(review): max_font_size is not referenced by the functions visible in
    // this file; confirm whether anything else still reads it.
    var max_font_size = 50;

    // Number of words shown in one cloud.
    var word_count = 50;
    // Largest font size given to a displayed word.
    var word_max_size = 60;
    // Smallest font size given to a displayed word.
    var word_min_size = 10;
    /**
     * transformWordFraquency: turns a word -> count dictionary into the
     * [{text, size}] array consumed by createWordCloud. The identifier keeps
     * its existing spelling because callers reference it by this name.
     *
     * @param {Object} words dictionary shaped like {word1: cnt1, word2: cnt2}
     * @returns {Array} at most word_count entries {text, size}, most frequent
     *          word first; size is a font size that starts at word_max_size
     *          and decreases in equal steps towards word_min_size
     */
    function transformWordFraquency(words) {
    	// Object.keys lists own enumerable keys only, so properties inherited
    	// through the prototype chain are not mistaken for words. "|| {}"
    	// keeps a null/undefined dictionary yielding an empty result.
    	var ar = Object.keys(words || {}).map(function(word) {
    		return {
    			"text" : word,
    			"size" : words[word]
    		};
    	});
    	// Highest count first; only the most frequent words are kept.
    	ar.sort(function(x, y) {
    		return y['size'] - x['size'];
    	});
    	ar = ar.slice(0, word_count);
    	// Replace the raw counts by font sizes according to rank.
    	var step = (word_max_size - word_min_size) / ar.length;
    	for (var i = 0; i < ar.length; i++) {
    		ar[i]['size'] = word_max_size - step * i;
    	}
    	return ar;
    }
    /**
     * Runs the d3-cloud layout for one word list and passes the words
     * delivered by the layout's "end" event to renderWordCloud.
     *
     * @param {Array} wordMap array shaped like [{text: "", size: ""}]
     * @param {string} selector target location the cloud is rendered into
     */
    function createWordCloud(wordMap, selector) {
    	var layoutSize = [ wordCloudWidth * 2 - 100, wordCloudHeight * 2 - 100 ];

    	// The font size of a word comes from its own size field.
    	function sizeOfWord(d) {
    		return d.size;
    	}

    	// Every word is given rotation 0.
    	function zeroRotation() {
    		return 0;
    	}

    	// Handler for the layout's "end" event.
    	function drawWhenDone(words) {
    		renderWordCloud(words, selector);
    	}

    	d3.layout.cloud()
    			.size(layoutSize)
    			.words(wordMap)
    			.font(font_name)
    			.fontWeight(font_weight)
    			.fontSize(sizeOfWord)
    			.rotate(zeroRotation)
    			.on("end", drawWhenDone)
    			.start();
    }
    /**
     * Draws laid-out words as <text> elements inside a new <svg> appended to
     * the node matched by selector. The original author notes that with this
     * call style the order of the calls must not change, so the d3 calls are
     * issued in exactly the original order.
     *
     * @param {Array} words entries carrying text, size, x, y and rotate
     * @param {string} selector target the <svg> is appended to
     */
    function renderWordCloud(words, selector) {
    	// Moves the group's origin to the centre of the svg.
    	var centre = "translate(" + wordCloudWidth / 2 + "," + wordCloudHeight / 2 + ")";

    	// Font size of one word, in px.
    	function fontSizeOf(d) {
    		return d.size + "px";
    	}

    	// Font colour, picked from the colour scale by index.
    	function colourOf(d, i) {
    		return fill(i);
    	}

    	// Position and rotation of one word.
    	function placementOf(d) {
    		return "translate(" + [ d.x, d.y ] + ") rotate(" + d.rotate + ")";
    	}

    	function textOf(d) {
    		return d.text;
    	}

    	var svg = d3.select(selector).append("svg")
    			.attr("width", wordCloudWidth)
    			.attr("height", wordCloudHeight);
    	var group = svg.append("g").attr("transform", centre);
    	// enter() steps into words, comparable to a for loop over the data.
    	var labels = group.selectAll("text").data(words).enter().append("text");

    	// NOTE(review): the original left .attr("text-anchor", "middle")
    	// commented out at this point; confirm whether that is intentional.
    	labels.style("font-family", font_name)
    			.style("font-weight", font_weight)
    			.style("font-size", fontSizeOf)
    			.style("fill", colourOf)
    			.attr("transform", placementOf)
    			.text(textOf);
    }
    
  • 相关阅读:
    项目上线前的优化
    vue项目打包上线流程以及遇到的问题
    js放大镜
    vue中mixins(混入)的使用
    vue中的provide和inject (依赖注入)
    Vue Virtual Dom 和 Diff原理
    vue 过滤器
    vue自定义指令的使用场景
    php---前后端分离跨域问题的解决
    PHP---for、while、foreach性能比较
  • 原文地址:https://www.cnblogs.com/weiyinfu/p/7089114.html
Copyright © 2011-2022 走看看