zoukankan      html  css  js  c++  java
  • 爬虫acm比赛成绩(多页成绩整合在一起、获取复制不了的数据)(hihocoder、计蒜客)

    https://github.com/congmingyige/web-crawler_rank-of-competition-in-JiSuanKe-and-hihocoder

    1. 计蒜客(获取复制不了的数据)

      1 import java.util.Scanner;
      2 
      3 /**
      4  * 无法从网页上获得源代码
      5  */
      6 
      7 public class GetScore_jisuanke {
      8     
      9     static String PREFIX_UNICODE= "\u";
     10     static char ascii2Char(String str) { 
     11         if (str.length() != 6) { 
     12             throw new IllegalArgumentException("Ascii string of a native character must be 6 character."); 
     13         } 
     14         if (!PREFIX_UNICODE.equals(str.substring(0, 2))) { 
     15             throw new IllegalArgumentException("Ascii string of a native character must start with "\u"."); 
     16         } 
     17         String tmp = str.substring(2, 4); // 将十六进制转为十进制 
     18         int code = Integer.parseInt(tmp, 16) << 8; // 转为高位,后与地位相加 
     19         tmp = str.substring(4, 6); 
     20         code += Integer.parseInt(tmp, 16); // 与低8为相加 
     21         return (char) code; 
     22     } 
     23     
     24     static String ascii2Native(String str) { 
     25         StringBuilder sb = new StringBuilder(); 
     26         int begin = 0; 
     27         int index = str.indexOf(PREFIX_UNICODE); 
     28         while (index != -1) { 
     29             sb.append(str.substring(begin, index)); 
     30             sb.append(ascii2Char(str.substring(index, index + 6))); 
     31             begin = index + 6; index = str.indexOf(PREFIX_UNICODE, begin); 
     32         } 
     33         sb.append(str.substring(begin)); 
     34         return sb.toString(); 
     35     }
     36 
     37     /*
     38      * unicode代码  来自 黑暗的笑 的CSDN 博客 ,全文地址请点击:https://blog.csdn.net/xia744510124/article/details/51322107?utm_source=copy 
     39      */
     40     
     41     public static void main(String[] args) {
     42         Scanner in=new Scanner(System.in);
     43         String str,s;
     44         int s1,s2,s3;
     45         String tag=new String("</script>");
     46         int x,y,sum_pro,i;
     47         
     48         while ((str=in.nextLine())!=null) {
     49             if (str.length()>=9 && str.substring(0,9).equals(tag)) {
     50 
     51                 s="problem_naming";
     52                 x=str.indexOf(s);
     53                 x+=s.length()+3;
     54                 y=str.indexOf("]",x);
     55                 sum_pro=(y-x)/4;
     56 
     57                 System.out.print("team	school	count	time	");
     58                 for (i=0;i<sum_pro;i++)
     59                     System.out.print((char)(65+i)+"	");
     60                 System.out.println();
     61                 
     62                 y=str.indexOf("prev_page_url",y);
     63                 
     64                 while (true) {
     65                     s="name";
     66                     x=str.indexOf(s,y);
     67                     if (x==-1)
     68                         break;
     69                     x+=s.length()+3;
     70                     y=str.indexOf(""",x);
     71                     System.out.print(str.substring(x,y)+"	");
     72                     
     73                     s="school";
     74                     x=str.indexOf(s,y);
     75                     x+=s.length()+3;
     76                     y=str.indexOf(""",x);
     77                     System.out.print(ascii2Native(str.substring(x,y))+"	");
     78                     
     79                     s="score";
     80                     x=str.indexOf(s,y);
     81                     x+=s.length()+2;
     82                     y=str.indexOf(",",x);
     83                     System.out.print(str.substring(x,y)+"	");
     84 
     85                     s="cost";
     86                     x=str.indexOf(s,y);
     87                     x+=s.length()+2;
     88                     y=str.indexOf(",",x);
     89                     System.out.print(str.substring(x,y)+"	");
     90                     
     91                     // until not exists or ==cost  -1
     92                     for (i=1;i<=sum_pro;i++) {
     93                         //cost":120,"exact_cost":7144,"submit_count":4,"problem_score":1,"score":0
     94                         s="cost"";
     95                         x=str.indexOf(s,y);
     96                         x+=s.length()+1;    //2-1
     97                         y=str.indexOf(",",x);
     98                         s1=Integer.valueOf(str.substring(x,y));
     99 
    100                         s="exact_cost";
    101                         x=str.indexOf(s,y);
    102                         x+=s.length()+2;
    103                         y=str.indexOf(",",x);
    104                         s2=Integer.valueOf(str.substring(x,y));
    105                         
    106                         s="submit_count";
    107                         x=str.indexOf(s,y);
    108                         x+=s.length()+2;
    109                         y=str.indexOf(",",x);
    110                         s3=Integer.valueOf(str.substring(x,y));
    111                         
    112                         if (s2!=0)
    113                             System.out.print(s1);
    114                         else
    115                             System.out.print("——");
    116                         System.out.print("("+s3+")	");
    117                     }
    118                     System.out.println();
    119                 }
    120             }
    121         }
    122     }
    123 }

    效果:

    2. hihocoder(多页成绩整合在一起)

      1 /**
      2  * get source code:
      3  * https://www.cnblogs.com/chaohu13/p/5337498.html
      4  */
      5 import java.io.BufferedReader;
      6 import java.io.InputStreamReader;
      7 import java.net.HttpURLConnection;
      8 import java.net.URL;
      9 
     10 public class GetScore_hiho {
     11     public static void main(String args[]){    
     12         URL url;
     13         int responsecode;
     14         HttpURLConnection urlConnection;
     15         BufferedReader reader;
     16         String str,str1;
     17         String tag=new String("<tr class="std-acm">");
     18         String website;
     19     //修改1 必须是"rank?page="形式
     20         website=new String("http://hihocoder.com/contest/acmicpc2018beijingonline/rank?page=1");
     21         int x,y,i;
     22     //修改2
     23         int page=13;
     24     int index=0;
     25         Boolean vis;
     26         
     27         vis=false;    //首栏只用存在一次
     28         for (index=1;index<=page;index++) {
     29             try{
     30                 //生成一个URL对象,要获取源代码的网页地址为:http://www.sina.com.cn
     31                 x=website.indexOf("=");
     32                 website=website.substring(0,x+1)+String.valueOf(index);
     33                 url=new URL(website);
     34                 
     35                 //打开URL
     36                 urlConnection = (HttpURLConnection)url.openConnection();
     37                 //获取服务器响应代码
     38                 responsecode=urlConnection.getResponseCode();
     39                 if(responsecode==200){
     40                     //得到输入流,即获得了网页的内容
     41                     reader=new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"UTF-8"));//GBK
     42                     while((str=reader.readLine().trim())!=null){
     43     //                    System.out.println(str);    //test
     44                         if (str.equals(tag)==true) {
     45                             str=reader.readLine().trim();
     46                             x=str.indexOf(">");
     47                             y=str.indexOf("<",x);
     48 
     49                             if (str.substring(x+1,y).equals("Rank")==true) {
     50                                 if (vis==false) {
     51                                     vis=true;
     52                                     System.out.print(str.substring(x+1,y).trim()+"	");
     53                                     while (true) {
     54                                         str=reader.readLine().trim();
     55                                         if (str.equals("</tr>")==true)
     56                                             break;
     57                                         x=str.indexOf(">");
     58                                         y=str.indexOf("<",x);
     59                                         System.out.print(str.substring(x+1,y).trim()+"	");
     60                                         
     61                                         if ((x=str.indexOf(">",y))!=str.length()-1) {
     62                                             y=str.indexOf("<",x);
     63                                             System.out.print(str.substring(x+1,y).trim()+"	");
     64                                         }
     65                                     }
     66                                     System.out.println();
     67                                 }
     68 //                                System.exit(0);    //test
     69                             }
     70                             else {
     71                                 /*
     72                                  * <td>1</td>
     73                                  * <td>清华大学</td>
     74                                  */
     75                                 System.out.print(str.substring(x+1,y).trim()+"	");
     76                                 for (i=2;i<=2;i++) {    //1+1
     77                                     str=reader.readLine().trim();
     78                                     x=str.indexOf(">");
     79                                     y=str.indexOf("<",x);
     80                                     System.out.print(str.substring(x+1,y).trim()+"	");    
     81                                 }
     82         
     83                                 //<td><a class="fn-ell" style="display: block;" href="/user/109506">team181814</a></td>
     84                                 str=reader.readLine().trim();
     85                                 x=str.indexOf(">",5);
     86                                 y=str.indexOf("<",x);
     87                                 System.out.print(str.substring(x+1,y).trim()+"	");    
     88         
     89                                 /*
     90                                  * <td class="solved">8</td>
     91                                  * <td>15:20:09</td>
     92                                  */
     93                                 for (i=1;i<=2;i++) {
     94                                     str=reader.readLine().trim();
     95                                     x=str.indexOf(">");
     96                                     y=str.indexOf("<",x);
     97                                     System.out.print(str.substring(x+1,y).trim()+"	");    
     98                                 }
     99                                 
    100                                 while (true) {
    101                                     str=reader.readLine().trim();
    102                                     if (str.equals("</tr>")==true)
    103                                         break;
    104                                     str=reader.readLine().trim();
    105                                     str=reader.readLine().trim();
    106                                     if (str.equals("</td>")==true)
    107                                         str="";
    108                                     else if (str.charAt(0)>='0' && str.charAt(0)<='9') {
    109                                         x=str.indexOf("<br>");
    110                                         if (x!=-1) {
    111                                             y=str.indexOf(")",x+4);
    112                                             str=str.substring(0,7)+" "+str.substring(x+4,y+1);
    113                                             str1=reader.readLine();    //读多一行
    114                                         }
    115                                         else
    116                                             str=str.substring(0,7);
    117                                     }
    118                                     else {
    119                                         x=str.indexOf(")");
    120                                         str=str.substring(0,x+1);
    121                                         str1=reader.readLine();    //读多一行
    122                                     }
    123                                     System.out.print(str+"	");
    124                                 }
    125                                 System.out.println();
    126                             }
    127 //                            System.exit(0);    //test
    128                         }
    129                     }
    130                 }
    131                 else{
    132                     System.out.println("获取不到网页的源码,服务器响应代码为:"+responsecode);
    133                 }
    134             }
    135             catch(Exception e){
    136                 //End Of Input
    137     //            System.out.println("获取不到网页的源码,出现异常:"+e);
    138             }
    139         }
    140         
    141 
    142     }
    143 }
    144 /*
    145                         p=Pattern.compile("<td>|</td>");
    146                         m=p.matcher(str);
    147                         str=m.replaceAll("");
    148                         System.out.print(str+"	");
    149 */

    效果:

  • 相关阅读:
    SpringBoot-配置Druid-yml方式
    CentOS7下配置Nginx并实现简单的负载均衡
    用私有构造器或者枚举类型强化Singleton
    virtualenv虚拟环境的使用
    Windows平台安装Python
    window平台基于influxdb + grafana + jmeter 搭建性能测试实时监控平台
    easy-mock本地部署成功,访问报错:EADDRNOTAVAIL 0.0.0.0:7300 解决方案
    npm install 报错: WARN checkPermissions Missing write access to 解决方案
    npm install 报错:ERR! code EINTEGRITY 解决方案
    最新版chrome浏览器如何离线安装crx插件?(转载)
  • 原文地址:https://www.cnblogs.com/cmyg/p/9695649.html
Copyright © 2011-2022 走看看