zoukankan      html  css  js  c++  java
  • 爬取网页数据基础

    代码如下:

    package com.tracker.offline.tools;
    
    import com.alibaba.fastjson.JSONObject;
    import com.google.common.collect.Lists;
    import com.tracker.common.utils.StringUtil;
    import com.tracker.coprocessor.utils.JsonUtil;
    import org.apache.commons.lang.StringUtils;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.List;
    import java.util.Map;
    
    /**
     * 文件名:爬取页面上的数据
     */
    public class SpiderUserPosTag {
    
        private static List<Integer> idList = Lists.newArrayList(113717565,113856580);
    
        private static final String url="http://192.168.202.17:8080/business/job51_jobstats/actions/jobshow_list";
        private static final String output="E:\result.tsv";
    
        public String getDataFromWeb (String id) throws IOException {
    
            Document response = Jsoup.connect(url).timeout(12 * 1000).userAgent("Mozilla/5.0").method(Connection.Method.POST)
                    .ignoreContentType(true)
                    .cookie("JSESSIONID", "986C7BA4E6FE3DB5C4591F3481D3FF1D")
                    .header("Content-Type", "application/json;charset=UTF-8")
                    .data("a","b")
                    .requestBody("{"startTime":"20190624","endTime":"20190627","seType":"2","idType":"1","timeType":"1","startIndex":1,"offset":50,"id":"+id+"}")
                    .post();
            return response.text();
        }
    
    
        public static void main(String[] args)  throws Exception{
            SpiderUserPosTag sp=new SpiderUserPosTag();
            int n=0;
            int start=898440;
            BufferedWriter bw=new BufferedWriter(new FileWriter(new File(output),true));
            try {
                for (Integer id:idList) {
              //返回数据转化和解析,Map<String,List<Map<String,String>>> String line
    = sp.getDataFromWeb(String.valueOf(id)); Map<String,String> maps = JsonUtil.parseJSON2MapStr(line); String str2 = maps.get("result"); List<String> lists = JSONObject.parseArray(str2,String.class); for (String str3:lists) { Map<String,String> maps2 = JsonUtil.parseJSON2MapStr(str3); bw.write(StringUtil.joinString(" ",maps2.get("jobId"),maps2.get("jobName"),maps2.get("totalShowCount") ,maps2.get("totalClickCount"),maps2.get("totalApplyCount"),maps2.get("time"),maps2.get("webShowCount") ,maps2.get("webClickCount"),maps2.get("webApplyCount"),maps2.get("appShowCount"),maps2.get("appClickCount") ,maps2.get("appApplyCount"),maps2.get("mShowCount") ,maps2.get("mClickCount"),maps2.get("mApplyCount") ,maps2.get("showCount"),maps2.get("clickCount"),maps2.get("applyCount"))+" "); } } bw.flush(); bw.close(); } catch (IOException e){ e.printStackTrace(); } } }

    需要确定的三个元素:

    url:

    cookeid   和  请求body的格式:

    返回参数:

  • 相关阅读:
    mysql分表技术
    TP5.0 excel 导入导出
    整理:手机端弹出提示框,使用的bootstrap中的模态框(modal,弹出层),比kendo弹出效果好
    Bootstrap表单验证插件bootstrapValidator使用方法整理
    input属性为number时,如何去掉+、-号?
    input 属性为 number,maxlength不起作用如何解决?
    mysql给root开启远程访问权限
    thinkphp——通过在线编辑器添加的内容在模板里正确显示(只显示内容,而不是html代码)
    解决网站请求速度慢的一些方法
    JS封闭函数、闭包、内置对象
  • 原文地址:https://www.cnblogs.com/parent-absent-son/p/11317024.html
Copyright © 2011-2022 走看看