zoukankan      html  css  js  c++  java
  • Java---网络蜘蛛-网页邮箱抓取器~源码

    刚刚学完Socket,迫不及待的做了这个网页邮箱抓取~~~

    现在有越来越多的人热衷于做网络爬虫(网络蜘蛛),也有越来越多的地方需要网络爬虫,比如搜索引擎、资讯采集、舆情监测等等,诸如此类。网络爬虫涉及到的技术(算法/策略)广而复杂,如网页获取、网页跟踪、网页分析、网页搜索、网页评级和结构/非结构化数据抽取以及后期更细粒度的数据挖掘等方方面面,对于新手来说,不是一朝一夕便能完全掌握且熟练应用的,对于作者来说,更无法在一篇文章内就将其说清楚。因此在本篇文章中,我们仅将视线聚焦在网络爬虫的最基础技术——网页抓取方面。

    说到网页抓取,往往有两个点是不得不说的,首先是网页编码的识别,另外一个是对网页脚本运行的支持,除此之外,是否支持以POST方式提交请求和支持自动的cookie管理也是很多人所关注的重要方面。其实Java世界里,已经有很多开源的组件来支持各种各样方式的网页抓取了,包括上面提到的四个重点,所以说使用Java做网页抓取还是比较容易的。下面,作者将重点介绍其中的六种方式。


    自己以前做过微商,而且还掏钱买过抓取网络邮箱的软件~现在O(∩_∩)O哈哈~我自己做~当然啦,没有别人做得好~只是功能还是差不多啦~

    使用方法:输入一个带协议的起始网址(例如 http://www.example.com),程序会抓取该页面,并沿着页面中的链接逐层深入,查找其中出现的邮箱地址。

    因为博主知识有限,目前还没有学线程池,程序会为每个发现的链接直接新建一个线程,所以无法控制线程数量,见谅。
    同样因为没学线程池,程序里也没有设置停止按钮,水平不够啊。
    目前只能通过关闭软件来停止程序。

    package cn.hncu.bs;
    
    import java.io.BufferedOutputStream;
    import java.io.BufferedReader;
    import java.io.DataOutputStream;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import javax.swing.JOptionPane;
    
    /**
     * 
     * @author 陈浩翔
     * @version 1.0  2016-5-12
     */
    public class SpiderUi extends javax.swing.JFrame {
    
        public SpiderUi() {
            super("网络蜘蛛1.0-陈浩翔版权所有!");
            initComponents();
    
        }
    
        private void initComponents() {
    
            jLabel1 = new javax.swing.JLabel();
            jLabel2 = new javax.swing.JLabel();
            tfdUrl = new javax.swing.JTextField();
            jLabel3 = new javax.swing.JLabel();
            tfdTime = new javax.swing.JTextField();
            jLabel4 = new javax.swing.JLabel();
            btnRun = new javax.swing.JButton();
            jButton1 = new javax.swing.JButton();
    
            setDefaultCloseOperation(javax.swing.WindowConstants.EXIT_ON_CLOSE);
            setMinimumSize(new java.awt.Dimension(400, 400));
            getContentPane().setLayout(null);
    
            jLabel1.setFont(new java.awt.Font("Dialog", 1, 24));
            jLabel1.setForeground(new java.awt.Color(255, 0, 51));
            jLabel1.setText("u7f51u7edcu8718u86db-u7f51u9875u90aeu7bb1u6293u53d6u56681.0");
            getContentPane().add(jLabel1);
            jLabel1.setBounds(30, 20, 350, 70);
    
            jLabel2.setFont(new java.awt.Font("Dialog", 1, 14));
            jLabel2.setText("u9012u5f52u6df1u5165u5c42u6570:");
            getContentPane().add(jLabel2);
            jLabel2.setBounds(20, 190, 110, 30);
    
            tfdUrl.setFont(new java.awt.Font("Dialog", 1, 12));
            getContentPane().add(tfdUrl);
            tfdUrl.setBounds(20, 140, 350, 30);
    
            jLabel3.setFont(new java.awt.Font("Dialog", 1, 14));
            jLabel3.setText("u8d77u59cbURL:");
            getContentPane().add(jLabel3);
            jLabel3.setBounds(20, 100, 70, 30);
    
            tfdTime.setFont(new java.awt.Font("Dialog", 1, 14));
            getContentPane().add(tfdTime);
            tfdTime.setBounds(20, 230, 60, 30);
    
            jLabel4.setFont(new java.awt.Font("Dialog", 0, 11));
            jLabel4.setText("u5373u641cu7d22u7f51u9875u90aeu7bb1u65f6,u641cu7d22u6df1u5165u7684u5c42u6570,u5efau8bae200u5de6u53f3");
            getContentPane().add(jLabel4);
            jLabel4.setBounds(90, 230, 250, 30);
    
            btnRun.setFont(new java.awt.Font("Dialog", 1, 18));
            btnRun.setForeground(new java.awt.Color(0, 51, 255));
            btnRun.setText("u5f00u59cbu6293u53d6");
            btnRun.addActionListener(new java.awt.event.ActionListener() {
                public void actionPerformed(java.awt.event.ActionEvent evt) {
                    btnRunActionPerformed(evt);
                }
            });
            getContentPane().add(btnRun);
            btnRun.setBounds(40, 300, 110, 50);
    
            jButton1.setFont(new java.awt.Font("Dialog", 1, 18));
            jButton1.setForeground(new java.awt.Color(0, 51, 255));
            jButton1.setText("u5e2eu52a9");
            jButton1.addActionListener(new java.awt.event.ActionListener() {
                public void actionPerformed(java.awt.event.ActionEvent evt) {
                    jButton1ActionPerformed(evt);
                }
            });
            getContentPane().add(jButton1);
            jButton1.setBounds(230, 300, 120, 50);
    
    
        }
    
        private void jButton1ActionPerformed(java.awt.event.ActionEvent evt) {
            JOptionPane.showMessageDialog(this, "抓取的邮箱存储在D:\net\mail.txt文件中
    URL存储在D:\net\http.txt文件中");
        }
    
        private void btnRunActionPerformed(java.awt.event.ActionEvent evt) {
            int time;
            try {
                time = Integer.parseInt(tfdTime.getText());
            } catch (NumberFormatException e1) {
                JOptionPane.showMessageDialog(this, "输入的层数格式错误!应该为整数!");
                return;
            }
    
            try {
                String inet = tfdUrl.getText();
                URL url = new URL(inet);
                //System.out.println(url.getHost());
                File file = new File("D://net");
                if (!file.exists()) {
                    file.mkdir();
                }
    
                DataOutputStream dout = new DataOutputStream(
                        new BufferedOutputStream(new FileOutputStream(
                                "D:\net\mail.txt", true)));
                DataOutputStream doutHttp = new DataOutputStream(
                        new BufferedOutputStream(new FileOutputStream(
                                "D:\net\http.txt", true)));
                new Thread(new RunThread(url, time, dout, doutHttp)).start();
                //System.out.println("一个线程读取完!");
            } catch (MalformedURLException e) {
                JOptionPane.showMessageDialog(this, "请输入正确的URL地址!!");
                return;
            } catch (IOException e) {
                JOptionPane.showMessageDialog(this, "请输入正确的URL地址!!");
                return;
            }
        }
    
        public static void main(String args[]) {
            java.awt.EventQueue.invokeLater(new Runnable() {
                public void run() {
                    new SpiderUi().setVisible(true);
                }
            });
        }
    
        private javax.swing.JButton btnRun;
        private javax.swing.JButton jButton1;
        private javax.swing.JLabel jLabel1;
        private javax.swing.JLabel jLabel2;
        private javax.swing.JLabel jLabel3;
        private javax.swing.JLabel jLabel4;
        private javax.swing.JTextField tfdTime;
        private javax.swing.JTextField tfdUrl;
    }
    
    /**
     * Crawls a single page: every {@code http://} link found on it is recorded
     * and, while depth remains, followed on a new thread; every e-mail address
     * found on it is recorded.
     *
     * <p>Links and addresses are only written when they are not already listed
     * in the corresponding output file, so the output streams passed in are
     * expected to append to {@link #HTTP_FILE} and {@link #MAIL_FILE}.
     */
    class RunThread implements Runnable {
        /** File the mail stream appends to; scanned to skip known addresses. */
        private static final String MAIL_FILE = "D:\\net\\mail.txt";
        /** File the link stream appends to; scanned to skip known URLs. */
        private static final String HTTP_FILE = "D:\\net\\http.txt";
        /** Line terminator written after every record. */
        private static final String LINE_END = "\r\n";

        private static final Pattern MAIL_PATTERN =
                Pattern.compile("\\w+@\\w+(\\.\\w+)+");
        private static final Pattern URL_PATTERN =
                Pattern.compile("http://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?");

        private URL url = null;
        private int time = 0;
        private DataOutputStream dout = null;
        DataOutputStream doutHttp = null;


        /** Creates a task with no URL and depth 0; running it does nothing. */
        public RunThread() {
        }

        /**
         * @param url      page to fetch
         * @param time     remaining recursion depth; values of 0 or less do nothing
         * @param dout     stream that receives newly found e-mail addresses
         * @param doutHttp stream that receives newly found URLs
         */
        public RunThread(URL url, int time, DataOutputStream dout,
                DataOutputStream doutHttp) {
            this.url = url;
            this.time = time;
            this.dout = dout;
            this.doutHttp = doutHttp;
        }

        /**
         * Fetches the page line by line and processes links and addresses.
         * Any I/O failure ends the crawl of this page only.
         */
        @Override
        public void run() {
            if (time <= 0 || url == null) {
                return;
            }
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(url.openStream()));
                String line;
                while ((line = br.readLine()) != null) {
                    followLinks(line);
                    collectMails(line);
                }
            } catch (IOException e) {
                // Best effort: an unreachable page or a failed write must not
                // take down the rest of the crawl.
                System.err.println("Skipping " + url + ": " + e);
            } finally {
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException ignored) {
                        // Nothing useful can be done about a failed close.
                    }
                }
            }
        }

        /** Records every new link on {@code line} and follows it while depth remains. */
        private void followLinks(String line) throws IOException {
            Matcher mUrl = URL_PATTERN.matcher(line);
            while (mUrl.find()) {
                String link = mUrl.group();
                if (link.endsWith("jpg") || link.endsWith("png")) {
                    continue;
                }
                URL next;
                try {
                    next = new URL(link);
                } catch (MalformedURLException e) {
                    // Skip the bad link instead of abandoning the whole page.
                    continue;
                }
                // The stream is shared by every crawler thread; the lookup and
                // the write must be one atomic step or duplicates slip through.
                synchronized (doutHttp) {
                    if (containsLine(HTTP_FILE, link)) {
                        continue;
                    }
                    doutHttp.writeBytes(link + LINE_END);
                    doutHttp.flush();
                }
                // Children get one level less. Passing time-- handed them the
                // undecremented value, so the depth limit never took effect.
                if (time - 1 > 0) {
                    new Thread(new RunThread(next, time - 1, dout, doutHttp)).start();
                }
            }
        }

        /** Records every new e-mail address found on {@code line}. */
        private void collectMails(String line) throws IOException {
            Matcher m = MAIL_PATTERN.matcher(line);
            while (m.find()) {
                String mail = m.group();
                synchronized (dout) {
                    if (containsLine(MAIL_FILE, mail)) {
                        continue;
                    }
                    dout.writeBytes(mail + LINE_END);
                    dout.flush();
                }
            }
        }

        /**
         * Reports whether the file at {@code path} has a line equal to
         * {@code value}. A missing file counts as having no lines.
         */
        private static boolean containsLine(String path, String value)
                throws IOException {
            BufferedReader reader;
            try {
                reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream(path)));
            } catch (FileNotFoundException e) {
                return false;
            }
            try {
                String s;
                while ((s = reader.readLine()) != null) {
                    if (s.equals(value)) {
                        return true;
                    }
                }
                return false;
            } finally {
                reader.close();
            }
        }
    }

    程序主界面图:

  • 相关阅读:
    图的应用详解-数据结构
    图的遍历
    node.js基础模块http、网页分析工具cherrio实现爬虫
    NodeJS制作爬虫全过程
    Nodejs爬虫进阶教程之异步并发控制
    asp.net的临时文件夹
    Cms WebSite 编译非常慢
    查看数据库的表被谁锁住了,以及如何解锁
    WinRar 设置默认的压缩格式为zip
    Can not Stop-Computer in powershell 6.0
  • 原文地址:https://www.cnblogs.com/webmen/p/5739189.html
Copyright © 2011-2022 走看看