zoukankan      html  css  js  c++  java
  • 爬虫平台设置代理ip

    首先从国外的一个网站爬取免费的代理 IP 信息,并存入 MongoDB;然后在代码中为浏览器设置代理:

    在爬虫客户端抽象类中添加一个布尔属性 enableProxy(protected boolean enableProxy;),用来控制是否启用代理:

    设置代理的代码其实就以下几句:

    // Type 1 tells Firefox to use the manually configured proxy settings below.
    firefoxProfile.setPreference("network.proxy.type", 1);
    // Hosts listed here are contacted directly, bypassing the proxy.
    firefoxProfile.setPreference("network.proxy.no_proxies_on", "localhost, 127.0.0.1"); 

    // Proxy host and port used for plain HTTP traffic.
    firefoxProfile.setPreference("network.proxy.http", proxyHttp.getIp());
    firefoxProfile.setPreference("network.proxy.http_port", proxyHttp.getPort());

    // Proxy host and port used for HTTPS (SSL) traffic.
    firefoxProfile.setPreference("network.proxy.ssl", proxyHttps.getIp());
    firefoxProfile.setPreference("network.proxy.ssl_port", proxyHttps.getPort());

    以下是具体实现代码:

    /**
    * 爬虫客户端抽象类
    * 其生命周期如下
    * setSpiderDao→setRootUrl→setParamsMap→init→runSpider→returnData→destory
    */
    public abstract class SpiderClient {

    private static final Logger logger = LoggerFactory.getLogger(SpiderClient.class);
    protected SpiderDao spiderDao;
    protected SpiderData spiderData;
    protected WebDriver driver;
    protected String rootUrl;
    protected Map<String, Object> params;
    private String collection;
    protected boolean enableProxy;

    //.. get set

    /**
    * 初始化工作
    */
    public void init(){

    FirefoxProfile firefoxProfile = new FirefoxProfile(); 

    // 去掉css
    firefoxProfile.setPreference("permissions.default.stylesheet", 2);
    // 去掉图片
    firefoxProfile.setPreference("permissions.default.image", 2);
    // 去掉flash
    firefoxProfile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", false);
    //设置默认下载
    // 设置是否显示下载进度框
    firefoxProfile.setPreference("browser.download.manager.showWhenStarting", false);
    // browser.download.folderList 设置Firefox的默认 下载 文件夹。0是桌面;1是“我的下载”;2是自定义
    firefoxProfile.setPreference("browser.download.folderList", 2);
    // ,如果使用自定义路径,必须要将browser.download.folderList设置为2
    firefoxProfile.setPreference("browser.download.dir", System.getProperty("java.io.tmpdir")+"material_images");
    // 设置哪种类型的文件下载不询问直接下载
    firefoxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk","image/gif,image/png,image/jpeg,image/bmp,image/webp");
    /*firefoxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk",
    "application/zip,text/plain,application/vnd.ms-excel,text/csv,text/comma-separated-values,application/octet-stream,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.openxmlformats-officedocument.wordprocessingml.document");
    */
    //proxy
    if(enableProxy){
    firefoxProfile.setPreference("network.proxy.type", 1);
    firefoxProfile.setPreference("network.proxy.no_proxies_on", "localhost, 127.0.0.1");

    ProxyIP proxyHttp = getProxyIPForHttp();
    if(proxyHttp!=null){
    firefoxProfile.setPreference("network.proxy.http", proxyHttp.getIp());
    firefoxProfile.setPreference("network.proxy.http_port", proxyHttp.getPort());
    logger.info("Set http proxy: {}:{}",proxyHttp.getIp(),proxyHttp.getPort());
    }
    ProxyIP proxyHttps = getProxyIPForHttps();
    if(proxyHttps!=null){
    firefoxProfile.setPreference("network.proxy.ssl", proxyHttps.getIp());
    firefoxProfile.setPreference("network.proxy.ssl_port", proxyHttps.getPort());
    logger.info("Set https proxy: {}:{}",proxyHttps.getIp(),proxyHttps.getPort());
    }
    }
    this.driver = new FirefoxDriver(firefoxProfile);
    this.driver.manage().timeouts().implicitlyWait(30, TimeUnit.SECONDS);
    this.spiderData = new SpiderData();
    this.spiderData.setIds(new ArrayList<String>());

    }

    // Proxies are taken from the "China" region first (per the original author:
    // better connectivity and faster network speed).

    /**
     * Picks a random HTTP proxy from the stored proxy list.
     *
     * @return a randomly chosen HTTP proxy, or {@code null} if none is stored
     */
    private ProxyIP getProxyIPForHttp(){
        return pickRandomProxyIp("HTTP");
    }

    /**
     * Picks a random HTTPS proxy from the stored proxy list.
     *
     * @return a randomly chosen HTTPS proxy, or {@code null} if none is stored
     */
    private ProxyIP getProxyIPForHttps(){
        return pickRandomProxyIp("HTTPS");
    }

    /**
     * Shared lookup behind both proxy getters: queries up to 20 proxy records of the
     * given protocol for the "China" region from MongoDB and returns one at random.
     *
     * @param protocol protocol label passed to the DAO query ("HTTP" or "HTTPS")
     * @return a randomly chosen proxy, or {@code null} if the query returns
     *         {@code null} or an empty list
     */
    private ProxyIP pickRandomProxyIp(String protocol){
        // The proxy query is only available on the Mongo-backed DAO, hence the cast.
        MongoSpiderDao mongoSpiderDao = (MongoSpiderDao) spiderDao;
        List<ProxyIP> candidates = mongoSpiderDao.getProxyIP(protocol, "China", 20);
        if(candidates==null || candidates.isEmpty()){
            return null;
        }
        return candidates.get(RandomUtils.nextInt(0, candidates.size()));
    }

    ...

    }

    有个很好的自动化获取有效免费代理ip的项目:https://github.com/yzf233/IPProxyTool,只需要跑一下命令即可;

  • 相关阅读:
    jQuery之元素操作及事件绑定
    JS中常遇到的浏览器兼容问题和解决方法
    九九乘法表
    全选复习
    css基本知识
    js数组
    Spark常见错误问题汇总
    被问懵逼的Kafka面试题
    被问懵逼的数仓面试
    Flink模拟项目: 订单支付实时监控
  • 原文地址:https://www.cnblogs.com/yzf666/p/7150617.html
Copyright © 2011-2022 走看看