zoukankan      html  css  js  c++  java
  • java实现网页结构分析列表发现

    原文出处http://www.yund.tech/zdetail.html?type=1&id=ee06002e2b83e7677c30aedc52d3429e    

    作者:jstarseven 


      

    现在的网站千奇百怪,什么样格式的都有,需要提取网页中的列表数据,有时候挨个分析处理很头疼,本文是一个页面结构分析的程序,可以分析处理页面大致列表结构。

    废话不多说,我也不会说,show me code,code is terrible,so what  hahaha。-------jstarseven

    1.抽取元素dom结构框架

     1     /**
     2      * 分析元素dom结构框架
     3      *
     4      * @param node
     5      * @return
     6      */
     7     public String filterHtml(Element node) {
     8         //去除节点的属性值
     9         Document new_node = Jsoup.parse(node.outerHtml());
    10         Elements elements = new_node.getAllElements();
    11         for (Element item : elements) {
    12             Attributes attributes = item.attributes();
    13             for (Attribute a : attributes) {
    14                 if (a.getKey().equals(KeysEnum.attr_scroce)) {
    15                     item.removeAttr(a.getKey());
    16                     continue;
    17                 }
    18                 a.setValue(StringUtils.EMPTY);
    19             }
    20         }
    21         //去除注释节点,节点文本内容
    22         String str_new = new_node.outerHtml().replaceAll("<!--?(.*?)-->", "");
    23         str_new = str_new.replaceAll("\s*", "");
    24         str_new = str_new.replaceAll(">(.*?)<", "><");
    25         return str_new;
    26     }

    2.采用动态规划处理两个字符串相似度

     /**
      * Edit-distance helper used to decide whether two DOM skeleton strings are alike.
      */
     public class SimilarDegree {

         /** Threshold that callers compare a normalised similarity ratio against. */
         public static final double degree = 0.8;

         /**
          * Computes the Levenshtein edit distance between two strings by dynamic
          * programming (insert, delete and replace each cost 1).
          * <p>
          * Only two rows of the DP table are kept, so memory grows with the length of
          * {@code target} instead of with the product of both lengths.
          *
          * @param source first string, must not be null
          * @param target second string, must not be null
          * @return the minimum number of single-character edits turning source into target
          */
         public static int EditDistance(String source, String target) {
             char[] sources = source.toCharArray();
             char[] targets = target.toCharArray();
             int sourceLen = sources.length;
             int targetLen = targets.length;

             // previous[j] = distance between the first (i - 1) source chars and the first j target chars
             int[] previous = new int[targetLen + 1];
             int[] current = new int[targetLen + 1];
             for (int j = 0; j <= targetLen; j++) {
                 previous[j] = j;
             }

             for (int i = 1; i <= sourceLen; i++) {
                 current[0] = i;
                 for (int j = 1; j <= targetLen; j++) {
                     if (sources[i - 1] == targets[j - 1]) {
                         current[j] = previous[j - 1];
                     } else {
                         int insert = current[j - 1] + 1;
                         int delete = previous[j] + 1;
                         int replace = previous[j - 1] + 1;
                         current[j] = Math.min(insert, Math.min(delete, replace));
                     }
                 }
                 // The row just filled becomes the "previous" row of the next iteration.
                 int[] swap = previous;
                 previous = current;
                 current = swap;
             }
             return previous[targetLen];
         }

         public static void main(String[] args) {
             System.out.println(EditDistance("html > body > ul > li.proiect_item:nth-child(1) > div.item_row.item_row_title > div:nth-child(1) > a",
                     "html > body > ul > li.proiect_item:nth-child(2) > div.item_row.item_row_title > div:nth-child(1) > a"));
         }

     }
    View Code

    3.对网页中每个节点的一级孩子节点分类

     1 /**
     2      * 统计列表下各个一级节点类型及个数
     3      *
     4      * @param node
     5      * @return
     6      */
     7     private Map<String, Integer> getGroupNode(Element node) {
     8         Map<String, Integer> map = new HashMap<String, Integer>();
     9         Elements children = node.children();
    10         for (Element item : children) {
    11             if (KeysEnum.input.equalsIgnoreCase(item.tagName()) || KeysEnum.br.equalsIgnoreCase(item.tagName())
    12                     || KeysEnum.script.equalsIgnoreCase(item.tagName()) || KeysEnum.link.equalsIgnoreCase(item.tagName())
    13                     || KeysEnum.style.equalsIgnoreCase(item.tagName()) || KeysEnum.meta.equalsIgnoreCase(item.tagName())
    14                     || KeysEnum.select.equalsIgnoreCase(item.tagName()) || KeysEnum.option.equalsIgnoreCase(item.tagName())
    15                     || KeysEnum.video.equals(item.tagName()) || KeysEnum.audio.equals(item.tagName())
    16                     || KeysEnum.textarea.equals(item.tagName())) continue;
    17             String key = filterHtml(item);
    18             if (map.containsKey(key)) {
    19                 map.put(key, (Integer) map.get(key) + 1);
    20             } else {
    21                 boolean is_like = false;
    22                 for (String map_key : map.keySet()) {
    23                     int dis = SimilarDegree.EditDistance(key, (String) map_key);
    24                     float v = (float) (key.length() - dis) / key.length();
    25                     if (v > SimilarDegree.degree) {
    26                         map.put(map_key, (Integer) map.get(map_key) + 1);
    27                         is_like = true;
    28                         break;
    29                     }
    30                 }
    31                 if (!is_like) map.put(key, 1);
    32             }
    33         }
    34         return map;
    35     }

    4.处理网页中每个元素的叶子节点

     1 /**
     2      * 获取叶子节点选择器
     3      *
     4      * @param node
     5      */
     6     public static List<String> getYeziNodeSel(Element node) {
     7         List<String> list = new ArrayList<String>();
     8         Elements all = node.getAllElements();
     9         for (Element item : all) {
    10             if (item.children().isEmpty()) list.add(item.cssSelector());
    11         }
    12         return list;
    13     }

    5.时间提取工具类

      /**
       * jstarseven
       * General-purpose date extractor: turns a free-form timestamp into a {@link Date}.
       * <p>
       * The patterns in {@code DPTN} are tried in a fixed order with
       * {@code Matcher.find()}, so a pattern may match anywhere inside the input and the
       * first pattern that matches decides the result. Relative expressions are resolved
       * against the current system time.
       */
      public class DateParser {
          /** Hour of day used for date-only matches when positive; otherwise 0 is used. */
          private static int timezone = 0;

          private static final Pattern[] DPTN = {

                  // [0] first-second-year with optional time, e.g. "25-7-2011 11:43:57"
                  Pattern.compile(
                          "(\\d{1,2})[\\s\\-\\/](\\d{1,2})[\\s\\-\\/](20\\d{2})\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),

                  // [1] year-month-day with optional time, e.g. "2009-8-30 16:42:10", "2012 年 1 月 31 日"
                  Pattern.compile(
                          "((20)?\\d{2}) {0,2}[\\.\\-/年] {0,2}(\\d{1,2}) {0,2}[\\.\\-/月] {0,2}(\\d{1,2}) {0,2}[日 \\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s时](\\d{1,2})[:\\s分]?(\\d{1,2})?)?"),

                  // [2] year/MMdd
                  Pattern.compile("((20)?\\d{2})/(\\d{2})(\\d{2})"),

                  // [3] month-day without year, optional time, e.g. "11-13 15:24"
                  Pattern.compile(
                          "(\\d{1,2})[\\.\\-\\s/月](\\d{1,2})[日\\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),

                  // [4] optional today/yesterday/day-before word plus a time, e.g. "昨天 09:21"
                  Pattern.compile("([今前昨]天)?\\s{0,4}(\\d{1,2})[:\\s]{1,3}(\\d{1,2})[:\\s]?(\\d{1,2})?"),

                  // [5] bare today/yesterday/day-before word, e.g. "昨天发表"
                  Pattern.compile("[今前昨]天"),

                  // [6] "<n|半> <unit> ago", e.g. "2周前", "半小时前"
                  Pattern.compile("((\\d{1,2})|(半))\\s*个?([天秒小时分钟周月年]{1,2})前"),

                  // [7] "<h> hours <m> minutes ago", e.g. "3小时26分钟前"
                  Pattern.compile("(\\d{1,2})小?时(\\d{1,2})分钟?前"),

                  // [8] compact yyyyMMdd-like digits
                  Pattern.compile("(20\\d{2})[01]?(\\d{2})[012]?(\\d{2})") };

          /**
           * Converts an arbitrary value into a {@link Date}.
           *
           * @param obj a {@link Date} (returned as is), a {@link Number} (epoch milliseconds),
           *            or any other object whose {@code toString()} text is scanned for a date
           * @return the recognised date, or {@code null} when {@code obj} is null, blank,
           *         the text "null", unrecognised, or fails a range check
           */
          public static Date parse(Object obj) {
              if (obj == null) {
                  return null;
              }
              if (obj instanceof Date) {
                  return (Date) obj;
              }
              if (obj instanceof Number) {
                  return new Date(((Number) obj).longValue());
              }
              // String.valueOf instead of a cast: non-String inputs no longer throw ClassCastException.
              String str = String.valueOf(obj).trim();
              if (str.isEmpty() || "null".equalsIgnoreCase(str)) {
                  return null;
              }
              // Remember the afternoon marker before transZH() strips it; otherwise the
              // 12-hour adjustment below could never apply.
              boolean pm = str.contains("下午");
              str = transZH(str);
              Calendar c = Calendar.getInstance();
              c.setTimeInMillis(System.currentTimeMillis());

              Matcher mt = DPTN[0].matcher(str);
              if (mt.find()) {
                  int date = Integer.parseInt(mt.group(2));
                  if ((date == 0) || (date > 31)) {
                      return null;
                  }
                  int month = Integer.parseInt(mt.group(1));
                  if (month <= 0) {
                      return null;
                  }
                  if (month > 12) {
                      // The first number cannot be a month: read the input as day-month if plausible.
                      if ((date <= 12) && (month < 32)) {
                          int tmp = month;
                          month = date;
                          date = tmp;
                      } else {
                          return null;
                      }
                  }
                  int year = Integer.parseInt(mt.group(3));
                  if ((year < 2000) || (year > 2099)) {
                      return null;
                  }
                  return withOptionalTime(c, year, month - 1, date, mt, 4, pm);
              }

              mt = DPTN[1].matcher(str);
              if (mt.find()) {
                  String sy = mt.group(1);
                  if (sy.length() == 2) {
                      sy = "20" + sy;
                  }
                  int year = Integer.parseInt(sy);
                  if ((year < 2000) || (year > 2099)) {
                      return null;
                  }
                  int month = Integer.parseInt(mt.group(3)) - 1;
                  if ((month < 0) || (month > 11)) {
                      return null;
                  }
                  int date = Integer.parseInt(mt.group(4));
                  if (date > 31) {
                      return null;
                  }
                  return withOptionalTime(c, year, month, date, mt, 8, pm);
              }

              mt = DPTN[2].matcher(str);
              if (mt.find()) {
                  String strYear = mt.group(1);
                  // Length check (as for pattern 1) so the two-digit year "20" becomes 2020, not 20.
                  if (strYear.length() == 2) {
                      strYear = "20" + strYear;
                  }
                  int year = Integer.parseInt(strYear);
                  int month = Integer.parseInt(mt.group(3)) - 1;
                  int day = Integer.parseInt(mt.group(4));
                  c.set(year, month, day, 0, 0, 0);
                  return c.getTime();
              }

              mt = DPTN[3].matcher(str);
              if (mt.find()) {
                  int year = c.get(Calendar.YEAR);
                  int month = Integer.parseInt(mt.group(1)) - 1;
                  if (month < 0) {
                      return null;
                  }
                  // A month later than the current one is taken to belong to last year.
                  if (month > c.get(Calendar.MONTH)) {
                      year--;
                  }
                  int date = Integer.parseInt(mt.group(2));
                  if (date > 31) {
                      return null;
                  }
                  return withOptionalTime(c, year, month, date, mt, 6, pm);
              }

              mt = DPTN[4].matcher(str);
              if (mt.find()) {
                  int hour = Integer.parseInt(mt.group(2));
                  if (hour >= 24) {
                      return null;
                  }
                  int min = Integer.parseInt(mt.group(3));
                  if (min >= 60) {
                      return null;
                  }
                  if (pm && (hour < 12)) {
                      hour += 12;
                  }
                  shiftByDayWord(c, mt.group(1));
                  c.set(Calendar.HOUR_OF_DAY, hour);
                  c.set(Calendar.MINUTE, min);
                  return c.getTime();
              }

              mt = DPTN[5].matcher(str);
              if (mt.find()) {
                  shiftByDayWord(c, mt.group(0));
                  return c.getTime();
              }

              // Pattern 7 must be tried before pattern 6: every "H小时M分钟前" text also
              // matches pattern 6 on its "M分钟前" tail, which would drop the hours.
              mt = DPTN[7].matcher(str);
              if (mt.find()) {
                  int hh = Integer.parseInt(mt.group(1));
                  int nn = Integer.parseInt(mt.group(2));
                  long t = 3600000L * hh + 60000L * nn;
                  return new Date(System.currentTimeMillis() - t);
              }

              mt = DPTN[6].matcher(str);
              if (mt.find()) {
                  long unit = unitMillis(mt.group(4));
                  if (unit < 0) {
                      return null;
                  }
                  String vs = mt.group(1);
                  long ago = "半".equals(vs) ? unit / 2L : Integer.parseInt(vs) * unit;
                  return new Date(System.currentTimeMillis() - ago);
              }

              mt = DPTN[8].matcher(str);
              if (mt.find()) {
                  int year = Integer.parseInt(mt.group(1));
                  if ((year < 2000) || (year > 2099)) {
                      return null;
                  }
                  int month = Integer.parseInt(mt.group(2)) - 1;
                  if ((month < 0) || (month > 11)) {
                      return null;
                  }
                  int date = Integer.parseInt(mt.group(3));
                  if (date > 31) {
                      return null;
                  }
                  c.set(year, month, date, timezone > 0 ? timezone : 0, 0, 0);
                  return c.getTime();
              }
              return null;
          }

          /**
           * Finishes a year/month/day match whose pattern has an optional time part.
           * Group {@code timeGroup} is the whole time; the next three groups are hour,
           * minute and optional second.
           *
           * @return the resulting date, or {@code null} if hour or minute is out of range
           */
          private static Date withOptionalTime(Calendar c, int year, int month, int date,
                                               Matcher mt, int timeGroup, boolean pm) {
              String time = mt.group(timeGroup);
              if ((time == null) || time.isEmpty()) {
                  c.set(year, month, date, timezone > 0 ? timezone : 0, 0, 0);
                  return c.getTime();
              }
              int hour = Integer.parseInt(mt.group(timeGroup + 1));
              if (hour >= 24) {
                  return null;
              }
              int min = Integer.parseInt(mt.group(timeGroup + 2));
              if (min >= 60) {
                  return null;
              }
              String ssec = mt.group(timeGroup + 3);
              int sec = ((ssec == null) || ssec.isEmpty()) ? 0 : Integer.parseInt(ssec);
              if (pm && (hour < 12)) {
                  hour += 12;
              }
              c.set(year, month, date, hour, min, sec);
              return c.getTime();
          }

          /** Moves the calendar back one day for 昨天 and two days for 前天; other words leave it unchanged. */
          private static void shiftByDayWord(Calendar c, String day) {
              if ("昨天".equals(day)) {
                  c.add(Calendar.DAY_OF_MONTH, -1);
              } else if ("前天".equals(day)) {
                  c.add(Calendar.DAY_OF_MONTH, -2);
              }
          }

          /** Returns the length of a relative-time unit in milliseconds, or -1 for an unknown unit. */
          private static long unitMillis(String unit) {
              switch (unit) {
                  case "年":
                      return 31536000000L;
                  case "月":
                      return 2592000000L;
                  case "周":
                      return 604800000L;
                  case "天":
                      return 86400000L;
                  case "小时":
                  case "时":
                      return 3600000L;
                  case "分钟":
                  case "分":
                      return 60000L;
                  case "秒":
                      return 1000L;
                  default:
                      return -1L;
              }
          }

          /**
           * Rewrites Chinese numerals as ASCII digits, replaces 整 by "0分" and removes
           * the 上午/下午 markers.
           * <p>
           * Each 十 is translated by its own neighbours: alone it becomes "10", after a
           * digit "0", before a digit "1", and between two digits it is dropped.
           */
          private static String transZH(String string) {
              String zh = "〇一二三四五六七八九";
              string = string.replace("整", "0分").replaceAll("[上下]午", "");
              StringBuilder buffer = new StringBuilder(string.length());
              for (char ch : string.toCharArray()) {
                  int index = zh.indexOf(ch);
                  if (index >= 0) {
                      buffer.append(index);
                  } else {
                      buffer.append(ch);
                  }
              }
              String str = buffer.toString();
              if (str.indexOf('十') == -1) {
                  return str;
              }
              StringBuilder out = new StringBuilder(str.length() + 2);
              for (int i = 0; i < str.length(); i++) {
                  char ch = str.charAt(i);
                  if (ch != '十') {
                      out.append(ch);
                      continue;
                  }
                  // Bounds-checked neighbour lookups: 十 may be the first or last character.
                  boolean digitBefore = (i > 0) && Character.isDigit(str.charAt(i - 1));
                  boolean digitAfter = (i + 1 < str.length()) && Character.isDigit(str.charAt(i + 1));
                  if (!digitBefore && !digitAfter) {
                      out.append("10");
                  } else if (digitBefore && !digitAfter) {
                      out.append('0');
                  } else if (!digitBefore) {
                      out.append('1');
                  }
                  // digit on both sides (e.g. 二十三): 十 contributes nothing
              }
              return out.toString();
          }

          public static void main(String[] args) {
              System.out.println(parse("1982-01-01 00:00:00"));
              System.out.println(transZH("二〇一七年九月十日 上午十时整"));
              System.out.println(transZH("二〇一七年九月二十日 上午九时整"));
              System.out.println(transZH("二〇一七年九月十九日 上午九时整"));
              System.out.println(transZH("二〇一七年九月二十三日 上午九时整"));
              System.out.println("timezone=" + timezone);
              String[] testdata = { "1982-01-01 00:00:00","11-13 15:24", "2009-8-30 16:42:10", "8-23 15:24", "2周前", "3  天前", "12  分钟前", "3天前",
                      "前天  09:36", "昨天 09:21 ", "2010-12-17 00:23 ", "2010-12-17 ", "昨天 12:37 ", "2011-8-15 08:42",
                      "25-7-2011 11:43:57", "1-9-2011", "06-03", "半小时前", "今天发表", "昨天发表", "前天发表", "06-03-2010",
                      "02-01-2010 00:39", "3小时26分钟前", "2010-8-24 上午 01:17:32", "2010-8-24 下午 01:17:32", "7小时前   »",
                      "4/29/2010 1:31:00", "2012 年 1 月 31 日", "17时20分前", "2017年10月12日 14时30分", "二〇一七年九月十九日 上午九时整" };

              DateFormat df = DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.MEDIUM);
              for (String s : testdata) {
                  Date d = parse(s);
                  System.out.println(s + "\t\t" + (d == null ? "null" : df.format(d)));
              }
          }

      }
    View Code

    6.自定义比较器对网页所有元素排序,发现结果靠前的基本都是列表元素

      比较器:按照疑似列表的可能性

     1 /**
     2      * 排序子节点
     3      * 1.最大相同dom结构长度
     4      * 2.最大相同dom结构元素数量
     5      *
     6      * @param nodes
     7      * @return
     8      */
     9     private Elements sortBy(Elements nodes, String base_url) {
    10 //        System.setProperty("java.util.Arrays.useLegacyMergeSort", "true");
    11         nodes.sort(new Comparator<Element>() {
    12             @Override
    13             public int compare(Element o1, Element o2) {
    14                 double o1_rate = reckonRate(o1);
    15                 double o2_rate = reckonRate(o2);
    16                 return (o2_rate > o1_rate) ? 1 : ((o2_rate == o1_rate) ? 0 : -1);
    17             }
    18 
    19             private double reckonRate(Element o) {
    20                 if (StringUtils.isNotBlank(base_url) && KeysEnum.a.equalsIgnoreCase(o.tagName()) && base_url.equalsIgnoreCase(o.attr(KeysEnum.attr_href)))
    21                     o.attr(KeysEnum.attr_list_tag_name, o.text());
    22                 if (null == o || o.children().size() < 2
    23                         || KeysEnum.html.equalsIgnoreCase(o.tagName()) || KeysEnum.body.equalsIgnoreCase(o.tagName()) || KeysEnum.link.equalsIgnoreCase(o.tagName())
    24                         || KeysEnum.head.equalsIgnoreCase(o.tagName()) || KeysEnum.title.equalsIgnoreCase(o.tagName()) || KeysEnum.meta.equalsIgnoreCase(o.tagName())
    25                         || KeysEnum.script.equalsIgnoreCase(o.tagName()) || KeysEnum.style.equalsIgnoreCase(o.tagName())) {
    26                     o.attr(KeysEnum.attr_scroce, "0");
    27                     return 0;
    28                 }
    29                 String style = o.attr(KeysEnum.style);
    30                 if (StringUtils.isNotBlank(style) && style.contains(KeysEnum.display_none)) {
    31                     o.attr(KeysEnum.attr_scroce, "0");
    32                     return 0;
    33                 }
    34                 Map<String, Object> maxKeyDom = getMaxKeyDom(o);
    35                 String key = (String) maxKeyDom.get(KeysEnum.max_key);
    36                 int num = (int) maxKeyDom.get(KeysEnum.max_num);
    37                 if (num < 2) {
    38                     o.attr(KeysEnum.attr_scroce, "0");
    39                     return 0;
    40                 }
    41                 int scroce = num * key.length();
    42                 Elements tags = o.children();
    43                 for (Element a : tags) {
    44                     if (KeysEnum.div.equalsIgnoreCase(a.tagName())) scroce += 5;
    45                     if (KeysEnum.ul.equalsIgnoreCase(a.tagName())) scroce += 10;
    46                     if (KeysEnum.li.equalsIgnoreCase(a.tagName())) scroce += 10;
    47                     if (KeysEnum.tbody.equalsIgnoreCase(a.tagName())) scroce += 5;
    48                     if (KeysEnum.table.equalsIgnoreCase(a.tagName())) scroce += 5;
    49                     if (KeysEnum.tr.equalsIgnoreCase(a.tagName())) scroce += 10;
    50                     if (KeysEnum.td.equalsIgnoreCase(a.tagName())) scroce += 1;
    51                     if (KeysEnum.a.equalsIgnoreCase(a.tagName())) scroce += 1;
    52                     if (KeysEnum.p.equalsIgnoreCase(a.tagName())) scroce += 1;
    53                     try {
    54                         Date time = DateParser.parse(a.text());
    55                         if (null != time) scroce += 20;
    56                     } catch (Exception e) {
    57                     }
    58                 }
    59                 if (o.text().contains(KeysEnum.next_page)) scroce += 100;
    60                 if (o.text().contains(KeysEnum.start_page) || o.text().contains(KeysEnum.fisrt_page)) scroce += 100;
    61                 if (o.text().contains(KeysEnum.end_page) || o.text().contains(KeysEnum.last_page) || o.text().contains(KeysEnum.final_page))
    62                     scroce += 100;
    63                 o.attr(KeysEnum.attr_scroce, String.valueOf(scroce));
    64                 return scroce;
    65             }
    66         });
    67         return nodes;
    68     }

    7.处理页面html,调用列表分析返回json结果

      1  /**
      2      * 提取页面列表元素的选择器以及页面分类标签
      3      *
      4      * @param document
      5      * @param is_subitem
      6      * @return
      7      */
      8     public static Map<String, Object> dealListNode(Document document, boolean is_subitem) throws Exception {
      9         Map<String, Object> result = new HashMap<String, Object>();
     10         try {
     11             ListAutoFire listAutoFire = new ListAutoFire();
     12             Elements list_node = listAutoFire.autoFireListNodes(document);
     13             List<Map<String, Object>> lists = new ArrayList();
     14             if (null != list_node && list_node.size() > 0) {
     15                 for (Element list_sel_item : list_node) {
     16                     if (list_sel_item.hasAttr(KeysEnum.attr_list_tag_name) && StringUtils.isNotBlank(list_sel_item.attr(KeysEnum.attr_list_tag_name))) {
     17                         result.put(KeysEnum.tag_name, list_sel_item.attr(KeysEnum.attr_list_tag_name));
     18                         continue;
     19                     }
     20                     Map<String, Object> list_dom_frame = new HashMap<>();
     21                     list_dom_frame.put(KeysEnum.list_sel, list_sel_item.cssSelector());
     22                     if (is_subitem) {
     23                         Map<String, List<String>> listItem = new HashMap<String, List<String>>();
     24                         for (Element item : list_sel_item.children())
     25                             listItem.put(item.cssSelector(), getYeziNodeSel(item));
     26                         list_dom_frame.put(KeysEnum.list_dom, listItem);
     27                     }
     28                     list_dom_frame.put(KeysEnum.attr_scroce, list_sel_item.attr(KeysEnum.attr_scroce));
     29                     lists.add(list_dom_frame);
     30                 }
     31             }
     32             result.put(KeysEnum.list, lists);
     33         } catch (Exception e) {
     34             throw new Exception(KeysEnum.error_info, e.getCause());
     35         }
     36         return result;
     37     }
     38 
     39     /**
     40      * 处理网页结构
     41      *
     42      * @param home_url   入口地址
     43      * @param list_index 列表元素获取数量
     44      * @param is_subitem 是否处理列表元素子项抽取 true/false
     45      * @param is_ifr     是否处理iframe true/false
     46      * @return
     47      */
     48     public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem, boolean is_ifr) {
     49         Map<String, Object> result = new HashMap<String, Object>();
     50         if (StringUtils.isBlank(home_url)) return result;
     51         try {
     52             Document html = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
     53             if (null == html) throw new Exception(KeysEnum.open_fail);
     54             Map<String, Object> mapNode = dealListNode(html, is_subitem);
     55             List listNode = (List) mapNode.get(KeysEnum.list);
     56             result.put(KeysEnum.home_url, home_url);
     57             result.put(KeysEnum.tag_name, mapNode.get(KeysEnum.tag_name));
     58             result.put(KeysEnum.list, listNode.subList(0, listNode.size() > list_index ? list_index : listNode.size()));
     59             result.put(KeysEnum.ifrs, new ArrayList());
     60             if (is_ifr) {
     61                 List<Map<String, Object>> ifrs = (List<Map<String, Object>>) result.get(KeysEnum.ifrs);
     62                 Elements iframe_nodes = html.getElementsByTag(KeysEnum.iframe);
     63                 if (null != iframe_nodes) {
     64                     for (Element iframe : iframe_nodes) {
     65                         String iframe_url = iframe.attr(KeysEnum.attr_src);
     66                         if (StringUtils.isBlank(iframe_url)) continue;
     67                         try {
     68                             Document iframe_html = Jsoup.connect(iframe_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
     69                             if (null == iframe_html) continue;
     70                             Map<String, Object> ifrMapNode = dealListNode(iframe_html, is_subitem);
     71                             List ifrListNode = (List) ifrMapNode.get(KeysEnum.list);
     72                             Map<String, Object> ifr_map = new HashMap();
     73                             ifr_map.put(KeysEnum.home_url, iframe_url);
     74                             ifr_map.put(KeysEnum.tag_name, ifrMapNode.get(KeysEnum.tag_name));
     75                             ifr_map.put(KeysEnum.list, ifrListNode.subList(0, ifrListNode.size() > list_index ? list_index : ifrListNode.size()));
     76                             ifrs.add(ifr_map);
     77                         } catch (Exception e) {
     78                             e.printStackTrace();
     79                         }
     80                     }
     81                 }
     82             }
     83         } catch (Exception e) {
     84             e.printStackTrace();
     85             result.clear();
     86             result.put(KeysEnum.home_url, home_url);
     87             result.put(KeysEnum.error, KeysEnum.error_info);
     88             result.put(KeysEnum.message, e.toString());
     89         }
     90         return result;
     91     }
     92 
     93     /**
     94      * 处理网页结构
     95      *
     96      * @param home_url   入口地址
     97      * @param list_index 列表元素获取数量
     98      * @param is_subitem 是否处理列表元素子项抽取 true/false
     99      * @return
    100      */
    101     public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem) {
    102         return getWebSiteFrame(home_url, list_index, is_subitem, false);
    103     }
    104 
    105     /**
    106      * 处理网页结构
    107      *
    108      * @param home_url   入口地址
    109      * @param list_index 列表元素获取数量
    110      * @return
    111      */
    112     public static Map<String, Object> getWebSiteFrame(String home_url, int list_index) {
    113         return getWebSiteFrame(home_url, list_index, false);
    114     }
    115 
    116     /**
    117      * 处理网页结构
    118      *
    119      * @param home_url 入口地址
    120      * @return
    121      */
    122     public static Map<String, Object> getWebSiteFrame(String home_url) {
    123         return getWebSiteFrame(home_url, 10);
    124     }
    View Code

    8.生成页面分析结果标记文件

     1 public static void createMarkFile(Map siteFrame, String home_url, String path) {
     2         try {
     3             Document doc = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
     4             if (null == doc) return;
     5             String style = ".mark_color {" +
     6                     "position:relative;" +
     7                     "pointer-events:none;" +
     8                     "left:0px;top:0px;" +
     9                     "display:inline-block;" +
    10                     "margin:-2px;100%;" +
    11                     "height:100%;" +
    12                     "border:dashed 2px #FF69B4;" +
    13                     "background-color: #43CD80;" +
    14                     "opacity:0.75;" +
    15                     "} " ;
    16             List list = (List) siteFrame.get("list");
    17             for (Object item : list) {
    18                 Map item_map = (Map) item;
    19                 String sel = (String) item_map.get("list_sel");
    20                 doc.select(sel).addClass("mark_color");
    21             }
    22             String content = doc.html();
    23             content = content.contains("<base") ? content : content.replaceFirst("<head", "<base href='" + home_url + "'/><style>" + style + "</style><head");
    24             FileUtils.writeStringToFile(new File(path), content, "UTF-8", false);
    25 
    26         } catch (IOException e) {
    27             e.printStackTrace();
    28         }
    29     }
    View Code

    9.上述第7步返回的结果示例:

    拿cnblogs首页(https://www.cnblogs.com/)做测试,返回结果:

    字段解释:

    home_url :分析的页面地址

    tag_name :当前页面的类型,多数情况下不正确——我只是拿home_url和页面中的url比对,取了匹配项对应的text

    list:页面中疑似列表元素

          list_sel:页面中疑似列表元素的选择器

          list_dom:以疑似列表元素的每个一级孩子节点的选择器为键,值为该孩子节点下各叶子元素的选择器列表

    ifrs:页面中所含iframe的分析结果,页面没有iframe则为空

      1 {
      2   "home_url": "https://www.cnblogs.com/",
      3   "tag_name": "1",
      4   "list": [
      5     {
      6       "list_sel": "#post_list",
      7       "list_dom": {
      8         "#post_list > div.post_item:nth-child(7)": [
      9           "#digg_count_9500831",
     10           "#post_list > div.post_item:nth-child(7) > div.digg > div.clear",
     11           "#digg_tip_9500831",
     12           "#post_list > div.post_item:nth-child(7) > div.post_item_body > h3 > a.titlelnk",
     13           "#post_list > div.post_item:nth-child(7) > div.post_item_body > p.post_item_summary",
     14           "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > a.lightblue",
     15           "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     16           "#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     17           "#post_list > div.post_item:nth-child(7) > div.clear"
     18         ],
     19         "#post_list > div.post_item:nth-child(19)": [
     20           "#digg_count_9499348",
     21           "#post_list > div.post_item:nth-child(19) > div.digg > div.clear",
     22           "#digg_tip_9499348",
     23           "#post_list > div.post_item:nth-child(19) > div.post_item_body > h3 > a.titlelnk",
     24           "#post_list > div.post_item:nth-child(19) > div.post_item_body > p.post_item_summary > a > img.pfs",
     25           "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > a.lightblue",
     26           "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     27           "#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     28           "#post_list > div.post_item:nth-child(19) > div.clear"
     29         ],
     30         "#post_list > div.post_item:nth-child(6)": [
     31           "#digg_count_9500833",
     32           "#post_list > div.post_item:nth-child(6) > div.digg > div.clear",
     33           "#digg_tip_9500833",
     34           "#post_list > div.post_item:nth-child(6) > div.post_item_body > h3 > a.titlelnk",
     35           "#post_list > div.post_item:nth-child(6) > div.post_item_body > p.post_item_summary",
     36           "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > a.lightblue",
     37           "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     38           "#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     39           "#post_list > div.post_item:nth-child(6) > div.clear"
     40         ],
     41         "#post_list > div.post_item:nth-child(9)": [
     42           "#digg_count_9500757",
     43           "#post_list > div.post_item:nth-child(9) > div.digg > div.clear",
     44           "#digg_tip_9500757",
     45           "#post_list > div.post_item:nth-child(9) > div.post_item_body > h3 > a.titlelnk",
     46           "#post_list > div.post_item:nth-child(9) > div.post_item_body > p.post_item_summary > a > img.pfs",
     47           "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > a.lightblue",
     48           "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     49           "#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     50           "#post_list > div.post_item:nth-child(9) > div.clear"
     51         ],
     52         "#post_list > div.post_item:nth-child(17)": [
     53           "#digg_count_9495616",
     54           "#post_list > div.post_item:nth-child(17) > div.digg > div.clear",
     55           "#digg_tip_9495616",
     56           "#post_list > div.post_item:nth-child(17) > div.post_item_body > h3 > a.titlelnk",
     57           "#post_list > div.post_item:nth-child(17) > div.post_item_body > p.post_item_summary > a > img.pfs",
     58           "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > a.lightblue",
     59           "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     60           "#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     61           "#post_list > div.post_item:nth-child(17) > div.clear"
     62         ],
     63         "#post_list > div.post_item:nth-child(8)": [
     64           "#digg_count_9500822",
     65           "#post_list > div.post_item:nth-child(8) > div.digg > div.clear",
     66           "#digg_tip_9500822",
     67           "#post_list > div.post_item:nth-child(8) > div.post_item_body > h3 > a.titlelnk",
     68           "#post_list > div.post_item:nth-child(8) > div.post_item_body > p.post_item_summary > a > img.pfs",
     69           "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > a.lightblue",
     70           "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     71           "#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     72           "#post_list > div.post_item:nth-child(8) > div.clear"
     73         ],
     74         "#post_list > div.post_item:nth-child(18)": [
     75           "#digg_count_9499454",
     76           "#post_list > div.post_item:nth-child(18) > div.digg > div.clear",
     77           "#digg_tip_9499454",
     78           "#post_list > div.post_item:nth-child(18) > div.post_item_body > h3 > a.titlelnk",
     79           "#post_list > div.post_item:nth-child(18) > div.post_item_body > p.post_item_summary > a > img.pfs",
     80           "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > a.lightblue",
     81           "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     82           "#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     83           "#post_list > div.post_item:nth-child(18) > div.clear"
     84         ],
     85         "#post_list > div.post_item:nth-child(3)": [
     86           "#digg_count_9500944",
     87           "#post_list > div.post_item:nth-child(3) > div.digg > div.clear",
     88           "#digg_tip_9500944",
     89           "#post_list > div.post_item:nth-child(3) > div.post_item_body > h3 > a.titlelnk",
     90           "#post_list > div.post_item:nth-child(3) > div.post_item_body > p.post_item_summary > a > img.pfs",
     91           "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > a.lightblue",
     92           "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
     93           "#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
     94           "#post_list > div.post_item:nth-child(3) > div.clear"
     95         ],
     96         "#post_list > div.post_item:nth-child(2)": [
     97           "#digg_count_9500357",
     98           "#post_list > div.post_item:nth-child(2) > div.digg > div.clear",
     99           "#digg_tip_9500357",
    100           "#post_list > div.post_item:nth-child(2) > div.post_item_body > h3 > a.titlelnk",
    101           "#post_list > div.post_item:nth-child(2) > div.post_item_body > p.post_item_summary",
    102           "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > a.lightblue",
    103           "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    104           "#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    105           "#post_list > div.post_item:nth-child(2) > div.clear"
    106         ],
    107         "#post_list > div.post_item:nth-child(5)": [
    108           "#digg_count_9500890",
    109           "#post_list > div.post_item:nth-child(5) > div.digg > div.clear",
    110           "#digg_tip_9500890",
    111           "#post_list > div.post_item:nth-child(5) > div.post_item_body > h3 > a.titlelnk",
    112           "#post_list > div.post_item:nth-child(5) > div.post_item_body > p.post_item_summary > a > img.pfs",
    113           "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > a.lightblue",
    114           "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    115           "#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    116           "#post_list > div.post_item:nth-child(5) > div.clear"
    117         ],
    118         "#post_list > div.post_item:nth-child(4)": [
    119           "#digg_count_9500935",
    120           "#post_list > div.post_item:nth-child(4) > div.digg > div.clear",
    121           "#digg_tip_9500935",
    122           "#post_list > div.post_item:nth-child(4) > div.post_item_body > h3 > a.titlelnk",
    123           "#post_list > div.post_item:nth-child(4) > div.post_item_body > p.post_item_summary > a > img.pfs",
    124           "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > a.lightblue",
    125           "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    126           "#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    127           "#post_list > div.post_item:nth-child(4) > div.clear"
    128         ],
    129         "#post_list > div.post_item:nth-child(1)": [
    130           "#digg_count_9501071",
    131           "#post_list > div.post_item:nth-child(1) > div.digg > div.clear",
    132           "#digg_tip_9501071",
    133           "#post_list > div.post_item:nth-child(1) > div.post_item_body > h3 > a.titlelnk",
    134           "#post_list > div.post_item:nth-child(1) > div.post_item_body > p.post_item_summary > a > img.pfs",
    135           "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > a.lightblue",
    136           "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    137           "#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    138           "#post_list > div.post_item:nth-child(1) > div.clear"
    139         ],
    140         "#post_list > div.post_item:nth-child(15)": [
    141           "#digg_count_9403762",
    142           "#post_list > div.post_item:nth-child(15) > div.digg > div.clear",
    143           "#digg_tip_9403762",
    144           "#post_list > div.post_item:nth-child(15) > div.post_item_body > h3 > a.titlelnk",
    145           "#post_list > div.post_item:nth-child(15) > div.post_item_body > p.post_item_summary > a > img.pfs",
    146           "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > a.lightblue",
    147           "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    148           "#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    149           "#post_list > div.post_item:nth-child(15) > div.clear"
    150         ],
    151         "#post_list > div.post_item:nth-child(16)": [
    152           "#digg_count_9499534",
    153           "#post_list > div.post_item:nth-child(16) > div.digg > div.clear",
    154           "#digg_tip_9499534",
    155           "#post_list > div.post_item:nth-child(16) > div.post_item_body > h3 > a.titlelnk",
    156           "#post_list > div.post_item:nth-child(16) > div.post_item_body > p.post_item_summary > a > img.pfs",
    157           "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > a.lightblue",
    158           "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    159           "#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    160           "#post_list > div.post_item:nth-child(16) > div.clear"
    161         ],
    162         "#post_list > div.post_item:nth-child(13)": [
    163           "#digg_count_9465698",
    164           "#post_list > div.post_item:nth-child(13) > div.digg > div.clear",
    165           "#digg_tip_9465698",
    166           "#post_list > div.post_item:nth-child(13) > div.post_item_body > h3 > a.titlelnk",
    167           "#post_list > div.post_item:nth-child(13) > div.post_item_body > p.post_item_summary > a > img.pfs",
    168           "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > a.lightblue",
    169           "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    170           "#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    171           "#post_list > div.post_item:nth-child(13) > div.clear"
    172         ],
    173         "#post_list > div.post_item:nth-child(14)": [
    174           "#digg_count_9498410",
    175           "#post_list > div.post_item:nth-child(14) > div.digg > div.clear",
    176           "#digg_tip_9498410",
    177           "#post_list > div.post_item:nth-child(14) > div.post_item_body > h3 > a.titlelnk",
    178           "#post_list > div.post_item:nth-child(14) > div.post_item_body > p.post_item_summary > a > img.pfs",
    179           "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > a.lightblue",
    180           "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    181           "#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    182           "#post_list > div.post_item:nth-child(14) > div.clear"
    183         ],
    184         "#post_list > div.post_item:nth-child(11)": [
    185           "#digg_count_9500633",
    186           "#post_list > div.post_item:nth-child(11) > div.digg > div.clear",
    187           "#digg_tip_9500633",
    188           "#post_list > div.post_item:nth-child(11) > div.post_item_body > h3 > a.titlelnk",
    189           "#post_list > div.post_item:nth-child(11) > div.post_item_body > p.post_item_summary",
    190           "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > a.lightblue",
    191           "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    192           "#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    193           "#post_list > div.post_item:nth-child(11) > div.clear"
    194         ],
    195         "#post_list > div.post_item:nth-child(12)": [
    196           "#digg_count_9500352",
    197           "#post_list > div.post_item:nth-child(12) > div.digg > div.clear",
    198           "#digg_tip_9500352",
    199           "#post_list > div.post_item:nth-child(12) > div.post_item_body > h3 > a.titlelnk",
    200           "#post_list > div.post_item:nth-child(12) > div.post_item_body > p.post_item_summary > a > img.pfs",
    201           "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > a.lightblue",
    202           "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    203           "#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    204           "#post_list > div.post_item:nth-child(12) > div.clear"
    205         ],
    206         "#post_list > div.post_item:nth-child(20)": [
    207           "#digg_count_9499225",
    208           "#post_list > div.post_item:nth-child(20) > div.digg > div.clear",
    209           "#digg_tip_9499225",
    210           "#post_list > div.post_item:nth-child(20) > div.post_item_body > h3 > a.titlelnk",
    211           "#post_list > div.post_item:nth-child(20) > div.post_item_body > p.post_item_summary > a > img.pfs",
    212           "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > a.lightblue",
    213           "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    214           "#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    215           "#post_list > div.post_item:nth-child(20) > div.clear"
    216         ],
    217         "#post_list > div.post_item:nth-child(10)": [
    218           "#digg_count_9500632",
    219           "#post_list > div.post_item:nth-child(10) > div.digg > div.clear",
    220           "#digg_tip_9500632",
    221           "#post_list > div.post_item:nth-child(10) > div.post_item_body > h3 > a.titlelnk",
    222           "#post_list > div.post_item:nth-child(10) > div.post_item_body > p.post_item_summary > a > img.pfs",
    223           "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > a.lightblue",
    224           "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
    225           "#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
    226           "#post_list > div.post_item:nth-child(10) > div.clear"
    227         ]
    228       },
    229       "scroce": "9860"
    230     },
    231     {
    232       "list_sel": "#cate_item",
    233       "list_dom": {
    234         "#cate_item_108705": [
    235           "#cate_item_108705 > a"
    236         ],
    237         "#cate_item_108704": [
    238           "#cate_item_108704 > a"
    239         ],
    240         "#cate_item_108703": [
    241           "#cate_item_108703 > a"
    242         ],
    243         "#cate_item_4": [
    244           "#cate_item_4 > a"
    245         ],
    246         "#cate_item_2": [
    247           "#cate_item_2 > a"
    248         ],
    249         "#cate_item_108709": [
    250           "#cate_item_108709 > a"
    251         ],
    252         "#cate_item_0": [
    253           "#cate_item_0 > a"
    254         ],
    255         "#cate_item_108698": [
    256           "#cate_item_108698 > a"
    257         ],
    258         "#cate_item_108724": [
    259           "#cate_item_108724 > a"
    260         ],
    261         "#cate_item_108701": [
    262           "#cate_item_108701 > a"
    263         ],
    264         "#cate_item_108712": [
    265           "#cate_item_108712 > a"
    266         ],
    267         "#cate_item_-1": [
    268           "#cate_item_-1 > a"
    269         ]
    270       },
    271       "scroce": "1248"
    272     },
    273     {
    274       "list_sel": "#friend_link",
    275       "list_dom": {
    276         "#friend_link > a:nth-child(15)": [
    277           "#friend_link > a:nth-child(15)"
    278         ],
    279         "#friend_link > a:nth-child(16)": [
    280           "#friend_link > a:nth-child(16)"
    281         ],
    282         "#friend_link > a:nth-child(17)": [
    283           "#friend_link > a:nth-child(17)"
    284         ],
    285         "#friend_link > a:nth-child(18)": [
    286           "#friend_link > a:nth-child(18)"
    287         ],
    288         "#friend_link > a:nth-child(1)": [
    289           "#friend_link > a:nth-child(1)"
    290         ],
    291         "#friend_link > a:nth-child(11)": [
    292           "#friend_link > a:nth-child(11)"
    293         ],
    294         "#friend_link > a:nth-child(12)": [
    295           "#friend_link > a:nth-child(12)"
    296         ],
    297         "#friend_link > a:nth-child(3)": [
    298           "#friend_link > a:nth-child(3)"
    299         ],
    300         "#friend_link > a:nth-child(13)": [
    301           "#friend_link > a:nth-child(13)"
    302         ],
    303         "#friend_link > a:nth-child(2)": [
    304           "#friend_link > a:nth-child(2)"
    305         ],
    306         "#friend_link > a:nth-child(14)": [
    307           "#friend_link > a:nth-child(14)"
    308         ],
    309         "#friend_link > a:nth-child(19)": [
    310           "#friend_link > a:nth-child(19)"
    311         ],
    312         "#friend_link > a:nth-child(5)": [
    313           "#friend_link > a:nth-child(5)"
    314         ],
    315         "#friend_link > a:nth-child(4)": [
    316           "#friend_link > a:nth-child(4)"
    317         ],
    318         "#friend_link > a:nth-child(7)": [
    319           "#friend_link > a:nth-child(7)"
    320         ],
    321         "#friend_link > a:nth-child(6)": [
    322           "#friend_link > a:nth-child(6)"
    323         ],
    324         "#friend_link > a:nth-child(10)": [
    325           "#friend_link > a:nth-child(10)"
    326         ],
    327         "#friend_link > a:nth-child(9)": [
    328           "#friend_link > a:nth-child(9)"
    329         ],
    330         "#friend_link > a:nth-child(8)": [
    331           "#friend_link > a:nth-child(8)"
    332         ]
    333       },
    334       "scroce": "1197"
    335     },
    336     {
    337       "list_sel": "#side_nav",
    338       "list_dom": {
    339         "#side_nav > div.w_l:nth-child(16)": [
    340           "#side_nav > div.w_l:nth-child(16) > h4",
    341           "#site_stats"
    342         ],
    343         "#side_nav > p.r_l_1:nth-child(7)": [
    344           "#side_nav > p.r_l_1:nth-child(7)"
    345         ],
    346         "#side_nav > p.r_l_2:nth-child(8)": [
    347           "#side_nav > p.r_l_2:nth-child(8)"
    348         ],
    349         "#side_nav > p.r_l_3:nth-child(9)": [
    350           "#side_nav > p.r_l_3:nth-child(9)"
    351         ],
    352         "#side_nav > p.r_l_1:nth-child(5)": [
    353           "#side_nav > p.r_l_1:nth-child(5)"
    354         ],
    355         "#side_nav > p.r_l_3:nth-child(13)": [
    356           "#side_nav > p.r_l_3:nth-child(13)"
    357         ],
    358         "#side_nav > p.r_l_2:nth-child(4)": [
    359           "#side_nav > p.r_l_2:nth-child(4)"
    360         ],
    361         "#side_nav > p.r_l_3:nth-child(19)": [
    362           "#side_nav > p.r_l_3:nth-child(19)"
    363         ],
    364         "#side_nav > p.r_l_3:nth-child(3)": [
    365           "#side_nav > p.r_l_3:nth-child(3)"
    366         ],
    367         "#side_nav > div.w_l:nth-child(6)": [
    368           "#side_nav > div.w_l:nth-child(6) > h4",
    369           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a",
    370           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a",
    371           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a",
    372           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a",
    373           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a",
    374           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"
    375         ],
    376         "#side_nav > p.r_l_2:nth-child(18)": [
    377           "#side_nav > p.r_l_2:nth-child(18)"
    378         ],
    379         "#side_nav > div.l_s:nth-child(12)": [
    380           "#side_nav > div.l_s:nth-child(12)"
    381         ],
    382         "#cate_title_block": [
    383           "#cate_title_title > div.cate_title",
    384           "#cate_item_108698 > a",
    385           "#cate_item_2 > a",
    386           "#cate_item_108701 > a",
    387           "#cate_item_108703 > a",
    388           "#cate_item_108704 > a",
    389           "#cate_item_108705 > a",
    390           "#cate_item_108709 > a",
    391           "#cate_item_108712 > a",
    392           "#cate_item_108724 > a",
    393           "#cate_item_4 > a",
    394           "#cate_item_0 > a",
    395           "#cate_item_-1 > a",
    396           "#cate_title_block > div.cate_bottom",
    397           "#cate_sub_block",
    398           "#cate_title_block > script"
    399         ],
    400         "#side_nav > div.l_s:nth-child(2)": [
    401           "#side_nav > div.l_s:nth-child(2)"
    402         ],
    403         "#side_nav > p.r_l_1:nth-child(17)": [
    404           "#side_nav > p.r_l_1:nth-child(17)"
    405         ],
    406         "#side_nav > p.r_l_2:nth-child(14)": [
    407           "#side_nav > p.r_l_2:nth-child(14)"
    408         ],
    409         "#side_nav > p.r_l_1:nth-child(15)": [
    410           "#side_nav > p.r_l_1:nth-child(15)"
    411         ],
    412         "#user_stats": [
    413           "#user_stats"
    414         ],
    415         "#side_nav > div.l_s:nth-child(10)": [
    416           "#side_nav > div.l_s:nth-child(10)"
    417         ]
    418       },
    419       "scroce": "975"
    420     },
    421     {
    422       "list_sel": "#paging_block > div.pager",
    423       "list_dom": {
    424         "#paging_block > div.pager > a.p_9.middle": [
    425           "#paging_block > div.pager > a.p_9.middle"
    426         ],
    427         "#paging_block > div.pager > a.p_7.middle": [
    428           "#paging_block > div.pager > a.p_7.middle"
    429         ],
    430         "#paging_block > div.pager > a.p_8.middle": [
    431           "#paging_block > div.pager > a.p_8.middle"
    432         ],
    433         "#paging_block > div.pager > a:nth-child(14)": [
    434           "#paging_block > div.pager > a:nth-child(14)"
    435         ],
    436         "#paging_block > div.pager > a.p_11.middle": [
    437           "#paging_block > div.pager > a.p_11.middle"
    438         ],
    439         "#paging_block > div.pager > a.p_3.middle": [
    440           "#paging_block > div.pager > a.p_3.middle"
    441         ],
    442         "#paging_block > div.pager > a.p_4.middle": [
    443           "#paging_block > div.pager > a.p_4.middle"
    444         ],
    445         "#paging_block > div.pager > a.p_10.middle": [
    446           "#paging_block > div.pager > a.p_10.middle"
    447         ],
    448         "#paging_block > div.pager > a.p_2.middle": [
    449           "#paging_block > div.pager > a.p_2.middle"
    450         ],
    451         "#paging_block > div.pager > a.p_5.middle": [
    452           "#paging_block > div.pager > a.p_5.middle"
    453         ],
    454         "#paging_block > div.pager > a.p_6.middle": [
    455           "#paging_block > div.pager > a.p_6.middle"
    456         ],
    457         "#paging_block > div.pager > a.p_1.current": [
    458           "#paging_block > div.pager > a.p_1.current"
    459         ],
    460         "#paging_block > div.pager > span.ellipsis": [
    461           "#paging_block > div.pager > span.ellipsis"
    462         ],
    463         "#paging_block > div.pager > a.p_200.last": [
    464           "#paging_block > div.pager > a.p_200.last"
    465         ]
    466       },
    467       "scroce": "865"
    468     },
    469     {
    470       "list_sel": "#main > div.post_nav_block_wrapper > ul.post_nav_block",
    471       "list_dom": {
    472         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1)": [
    473           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1) > a.current_nav"
    474         ],
    475         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3)": [
    476           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3) > a"
    477         ],
    478         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2)": [
    479           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2) > a"
    480         ],
    481         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5)": [
    482           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5) > a"
    483         ],
    484         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4)": [
    485           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4) > a"
    486         ],
    487         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7)": [
    488           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7) > a"
    489         ],
    490         "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6)": [
    491           "#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6) > a"
    492         ]
    493       },
    494       "scroce": "590"
    495     },
    496     {
    497       "list_sel": "#nav_menu",
    498       "list_dom": {
    499         "#nav_menu > a:nth-child(3)": [
    500           "#nav_menu > a:nth-child(3)"
    501         ],
    502         "#nav_menu > a:nth-child(2)": [
    503           "#nav_menu > a:nth-child(2)"
    504         ],
    505         "#nav_menu > a:nth-child(5)": [
    506           "#nav_menu > a:nth-child(5)"
    507         ],
    508         "#nav_menu > a:nth-child(4)": [
    509           "#nav_menu > a:nth-child(4)"
    510         ],
    511         "#nav_menu > a:nth-child(1)": [
    512           "#nav_menu > a:nth-child(1)"
    513         ],
    514         "#nav_menu > a:nth-child(7)": [
    515           "#nav_menu > a:nth-child(7)"
    516         ],
    517         "#nav_menu > a:nth-child(6)": [
    518           "#nav_menu > a:nth-child(6)"
    519         ],
    520         "#nav_menu > a:nth-child(9)": [
    521           "#nav_menu > a:nth-child(9)"
    522         ],
    523         "#nav_menu > a:nth-child(8)": [
    524           "#nav_menu > a:nth-child(8)"
    525         ]
    526       },
    527       "scroce": "486"
    528     },
    529     {
    530       "list_sel": "#side_nav > div.w_l:nth-child(6) > ul",
    531       "list_dom": {
    532         "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3)": [
    533           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a"
    534         ],
    535         "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2)": [
    536           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a"
    537         ],
    538         "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1)": [
    539           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a"
    540         ],
    541         "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6)": [
    542           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"
    543         ],
    544         "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5)": [
    545           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a"
    546         ],
    547         "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4)": [
    548           "#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a"
    549         ]
    550       },
    551       "scroce": "486"
    552     },
    553     {
    554       "list_sel": "#headline_block > ul",
    555       "list_dom": {
    556         "#headline_block > ul > li:nth-child(4)": [
    557           "#headline_block > ul > li:nth-child(4) > a:nth-child(1)",
    558           "#headline_block > ul > li:nth-child(4) > a.right_more"
    559         ],
    560         "#headline_block > ul > li.editor_pick": [
    561           "#editor_pick_count",
    562           "#headline_block > ul > li.editor_pick > a.right_more"
    563         ],
    564         "#headline_block > ul > li:nth-child(3)": [
    565           "#headline_block > ul > li:nth-child(3) > a:nth-child(1)",
    566           "#headline_block > ul > li:nth-child(3) > a.right_more"
    567         ],
    568         "#headline_block > ul > li:nth-child(2)": [
    569           "#headline_block > ul > li:nth-child(2) > a:nth-child(1)",
    570           "#headline_block > ul > li:nth-child(2) > a.right_more"
    571         ]
    572       },
    573       "scroce": "407"
    574     },
    575     {
    576       "list_sel": "#header",
    577       "list_dom": {
    578         "#header > p.h_r_3:nth-child(1)": [
    579           "#header > p.h_r_3:nth-child(1)"
    580         ],
    581         "#header > p.h_r_2:nth-child(6)": [
    582           "#header > p.h_r_2:nth-child(6)"
    583         ],
    584         "#header > p.h_r_1:nth-child(3)": [
    585           "#header > p.h_r_1:nth-child(3)"
    586         ],
    587         "#header > p.h_r_2:nth-child(2)": [
    588           "#header > p.h_r_2:nth-child(2)"
    589         ],
    590         "#header > p.h_r_1:nth-child(5)": [
    591           "#header > p.h_r_1:nth-child(5)"
    592         ],
    593         "#header > p.h_r_3:nth-child(7)": [
    594           "#header > p.h_r_3:nth-child(7)"
    595         ],
    596         "#header_block": [
    597           "#logo > h1 > a > img",
    598           "#header_block > div.clear"
    599         ]
    600       },
    601       "scroce": "335"
    602     }
    603   ],
    604   "ifrs": []
    605 }
    View Code

    10.上述第8步标记文件效果:

       红色虚线框起来的是返回的json结果中list中的list_sel选择器选中的元素

    分析结果统计:

    处理了将近1万个网站后发现,大致的网页列表结构基本都可以识别出来,平均耗时大约在2-3s;因为是用jsoup访问网页,这个时间包含了网页响应的时间,时间复杂度有待优化。

    对于一些结构比较复杂、杂乱的网页,分析结果的支持还有待加强;代码写得比较乱,有待优化,应该会有更好的处理方式,还请指教,相互学习交流。

    转载请注明出处:https://www.cnblogs.com/jstarseven/p/9501210.html

    源码地址:https://github.com/jstarseven/list-autofire

      


     -END-

  • 相关阅读:
    [JXOI2018]游戏
    UltraISO制作启动盘安装CentOS7
    龙芯、鲲鹏、飞腾等国产平台的浏览器
    浏览器相关性能测试网址
    有没有一种组合字体,中文是宋体,英文是times new roman?
    中国航天科工集团第二研究院706所招聘
    华为云
    面试阿里P6难在哪?(面试难点)
    Linux下配置环境变量—— .bashrc 和 /etc/profile
    Linux 内存 占用较高问题排查
  • 原文地址:https://www.cnblogs.com/jstarseven/p/9501210.html
Copyright © 2011-2022 走看看