zoukankan      html  css  js  c++  java
  • 抓取csdn上的各类别的文章 (制作csdn app 二)

    转载请标明出处:http://blog.csdn.net/lmj623565791/article/details/23532797

    这篇博客接着上一篇《Android 使用Fragment,ViewPagerIndicator 制作csdn app主要框架》继续实现接下来的功能,如果你想了解整个app的制作过程,你可以去看一下上一篇,当然如果你只对网页信息的抓取感兴趣,你可以直接阅读本篇博客。我会把app功能分解,尽可能让每篇之间的耦合度不会太高。

    好了,开始进入正题。这篇内容我新建一个java项目实现,一方面java调试比较方便,另一方面我会使用导入jar包的方式,把这个项目导入到android项目使用,大家如果在导jar方面没有经验,可以看下。

    先看下项目结构:

          

    定义了一个NewsItem,对应app中每个ListView的Item;Constaint是个接口,存放了一些常量;还有就是一些辅助类。

    NewsItem.java

    package com.zhy.bean;
    
    /**
     * Mutable value holder for one news entry: an id, title, link, publish date,
     * image link, summary content and a numeric news type.
     */
    public class NewsItem
    {
    	/** Numeric identifier; stays 0 until {@link #setId(int)} is called. */
    	private int id;

    	/** Title. */
    	private String title;

    	/** Link. */
    	private String link;

    	/** Publish date, kept as text. */
    	private String date;

    	/** Link of the image. */
    	private String imgLink;

    	/** Content. */
    	private String content;

    	/** Type of the news entry. */
    	private int newsType;

    	/** @return the numeric identifier */
    	public int getId()
    	{
    		return this.id;
    	}

    	/** @param id the numeric identifier to store */
    	public void setId(int id)
    	{
    		this.id = id;
    	}

    	/** @return the title, or {@code null} if none was set */
    	public String getTitle()
    	{
    		return this.title;
    	}

    	/** @param title the title to store */
    	public void setTitle(String title)
    	{
    		this.title = title;
    	}

    	/** @return the link, or {@code null} if none was set */
    	public String getLink()
    	{
    		return this.link;
    	}

    	/** @param link the link to store */
    	public void setLink(String link)
    	{
    		this.link = link;
    	}

    	/** @return the publish date text, or {@code null} if none was set */
    	public String getDate()
    	{
    		return this.date;
    	}

    	/** @param date the publish date text to store */
    	public void setDate(String date)
    	{
    		this.date = date;
    	}

    	/** @return the image link, or {@code null} if none was set */
    	public String getImgLink()
    	{
    		return this.imgLink;
    	}

    	/** @param imgLink the image link to store */
    	public void setImgLink(String imgLink)
    	{
    		this.imgLink = imgLink;
    	}

    	/** @return the content, or {@code null} if none was set */
    	public String getContent()
    	{
    		return this.content;
    	}

    	/** @param content the content to store */
    	public void setContent(String content)
    	{
    		this.content = content;
    	}

    	/** @return the news type */
    	public int getNewsType()
    	{
    		return this.newsType;
    	}

    	/** @param newsType the news type to store */
    	public void setNewsType(int newsType)
    	{
    		this.newsType = newsType;
    	}

    	/**
    	 * Renders every field as {@code NewsItem [id=..., title=..., ...]};
    	 * unset String fields appear as the text {@code null}.
    	 */
    	@Override
    	public String toString()
    	{
    		StringBuilder text = new StringBuilder("NewsItem [");
    		text.append("id=").append(id);
    		text.append(", title=").append(title);
    		text.append(", link=").append(link);
    		text.append(", date=").append(date);
    		text.append(", imgLink=").append(imgLink);
    		text.append(", content=").append(content);
    		text.append(", newsType=").append(newsType);
    		text.append("]");
    		return text.toString();
    	}

    }
    

    CommonException.java

    package com.zhy.bean;
    
    /**
     * Checked exception of this project; it adds nothing to {@link Exception}
     * beyond its own type and simply forwards each constructor to the superclass.
     */
    public class CommonException extends Exception
    {

    	/**
    	 * Creates an exception carrying only a detail message.
    	 *
    	 * @param message the detail message
    	 */
    	public CommonException(String message)
    	{
    		super(message);
    	}

    	/**
    	 * Creates an exception carrying a detail message and the underlying cause.
    	 *
    	 * @param message the detail message
    	 * @param cause the underlying cause
    	 */
    	public CommonException(String message, Throwable cause)
    	{
    		super(message, cause);
    	}

    	/**
    	 * Creates an exception that wraps the given cause.
    	 *
    	 * @param cause the underlying cause
    	 */
    	public CommonException(Throwable cause)
    	{
    		super(cause);
    	}

    	/**
    	 * Creates an exception with neither a message nor a cause.
    	 */
    	public CommonException()
    	{
    		// The no-argument superclass constructor is invoked implicitly.
    	}

    }
    

    Constaint.java

    package com.zhy.csdn;
    
    /**
     * Numeric identifiers of the news categories.
     * Fields declared in an interface are implicitly {@code public static final}.
     */
    public interface Constaint
    {
    	/** Category "yejie". */
    	int NEWS_TYPE_YEJIE = 1;

    	/** Category "yidong". */
    	int NEWS_TYPE_YIDONG = 2;

    	/** Category "yanfa". */
    	int NEWS_TYPE_YANFA = 3;

    	/** Category "chengxuyuan". */
    	int NEWS_TYPE_CHENGXUYUAN = 4;

    	/** Category "yunjisuan". */
    	int NEWS_TYPE_YUNJISUAN = 5;
    }
    
    DataUtil.java

    package com.zhy.csdn;
    
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;
    
    import com.zhy.bean.CommonException;
    
    /**
     * HTTP helper that downloads the body of a URL as text.
     */
    public class DataUtil
    {
    	/** Maximum time to wait for the connection to be established, in milliseconds. */
    	private static final int CONNECT_TIMEOUT_MS = 5000;

    	/** Maximum time to wait for data once connected, in milliseconds. */
    	private static final int READ_TIMEOUT_MS = 5000;

    	/**
    	 * Issues an HTTP GET for the given address and returns the response body decoded as UTF-8.
    	 * 
    	 * @param urlStr
    	 *            address to fetch; it must yield an {@link HttpURLConnection}
    	 * @return the response body as a string
    	 * @throws CommonException
    	 *             if the response code is not 200 or any error occurs while connecting or reading;
    	 *             the underlying exception, when there is one, is attached as the cause
    	 */
    	public static String doGet(String urlStr) throws CommonException
    	{
    		HttpURLConnection conn = null;
    		InputStream is = null;
    		try
    		{
    			URL url = new URL(urlStr);
    			conn = (HttpURLConnection) url.openConnection();
    			conn.setRequestMethod("GET");
    			conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
    			// Without a read timeout a stalled server would block the caller forever.
    			conn.setReadTimeout(READ_TIMEOUT_MS);
    			conn.setDoInput(true);
    			// setDoOutput(true) is intentionally not called: a GET request has no body to send.

    			if (conn.getResponseCode() != HttpURLConnection.HTTP_OK)
    			{
    				throw new CommonException("访问网络失败!");
    			}

    			is = conn.getInputStream();
    			// Collect the raw bytes first and decode once at the end. Decoding each chunk
    			// separately corrupts any multi-byte UTF-8 character that straddles a chunk boundary.
    			ByteArrayOutputStream body = new ByteArrayOutputStream();
    			byte[] buf = new byte[1024];
    			int len;
    			while ((len = is.read(buf)) != -1)
    			{
    				body.write(buf, 0, len);
    			}
    			return body.toString("UTF-8");

    		} catch (CommonException e)
    		{
    			// Already the exception type callers expect; do not wrap it a second time.
    			throw e;
    		} catch (Exception e)
    		{
    			throw new CommonException("访问网络失败!", e);
    		} finally
    		{
    			if (is != null)
    			{
    				try
    				{
    					is.close();
    				} catch (IOException ignored)
    				{
    					// Nothing useful can be done if closing fails; the outcome is already decided.
    				}
    			}
    			if (conn != null)
    			{
    				conn.disconnect();
    			}
    		}
    	}

    }
    

    URLUtil.java

    package com.zhy.csdn;
    
    
    /**
     * Holds the list-page base addresses and builds paged list URLs from them.
     */
    public class URLUtil
    {

    	public static final String NEWS_LIST_URL = "http://www.csdn.net/headlines.html";
    	public static final String NEWS_LIST_URL_YIDONG = "http://mobile.csdn.net/mobile";
    	public static final String NEWS_LIST_URL_YANFA = "http://sd.csdn.net/sd";
    	public static final String NEWS_LIST_URL_YUNJISUAN = "http://cloud.csdn.net/cloud";
    	public static final String NEWS_LIST_URL_ZAZHI = "http://programmer.csdn.net/programmer";
    	public static final String NEWS_LIST_URL_YEJIE = "http://news.csdn.net/news";

    	/**
    	 * Builds the list URL for a news type and page number as {@code <base>/<page>}.
    	 * 
    	 * @param newsType
    	 *            one of the {@code Constaint.NEWS_TYPE_*} values; any value without its own
    	 *            case uses {@link #NEWS_LIST_URL_YIDONG}
    	 * @param currentPage
    	 *            requested page number; values below 1 are treated as 1
    	 * @return the base address followed by a slash and the page number
    	 */
    	public static String generateUrl(int newsType, int currentPage)
    	{
    		int page = Math.max(currentPage, 1);
    		return baseUrlFor(newsType) + "/" + page;
    	}

    	/**
    	 * Picks the base address that belongs to a news type.
    	 */
    	private static String baseUrlFor(int newsType)
    	{
    		switch (newsType)
    		{
    		case Constaint.NEWS_TYPE_YEJIE:
    			return NEWS_LIST_URL_YEJIE;
    		case Constaint.NEWS_TYPE_YANFA:
    			return NEWS_LIST_URL_YANFA;
    		case Constaint.NEWS_TYPE_CHENGXUYUAN:
    			return NEWS_LIST_URL_ZAZHI;
    		case Constaint.NEWS_TYPE_YUNJISUAN:
    			return NEWS_LIST_URL_YUNJISUAN;
    		default:
    			// Covers Constaint.NEWS_TYPE_YIDONG as well as every unknown value.
    			return NEWS_LIST_URL_YIDONG;
    		}
    	}

    }

    NewsItemBiz.java业务类

    package com.zhy.biz;
    
    import java.util.ArrayList;
    import java.util.List;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.zhy.bean.CommonException;
    import com.zhy.bean.NewsItem;
    import com.zhy.csdn.DataUtil;
    import com.zhy.csdn.URLUtil;
    
    /**
     * Business class that produces {@link NewsItem} lists from downloaded list pages.
     * 
     * @author zhy
     * 
     */
    public class NewsItemBiz
    {
    	/**
    	 * Downloads one list page for the given news type and turns every element with class
    	 * {@code unit} into a {@link NewsItem}.
    	 * <p>
    	 * For each unit, the title and link come from the first child of the first {@code h1}, the
    	 * date from the first {@code ago} element inside the first {@code h4}, and the content from
    	 * the second child of the first {@code dl}. The image link is optional and is only set when
    	 * the first child of that {@code dl} has a nested child element to read {@code src} from.
    	 * <p>
    	 * NOTE(review): a unit lacking the {@code h1}, {@code h4}/{@code ago} or {@code dl} structure
    	 * still fails with an unchecked {@link IndexOutOfBoundsException}, exactly as before.
    	 * 
    	 * @param newsType
    	 *            news type used to build the URL and stored on every returned item
    	 * @param currentPage
    	 *            page number used to build the URL
    	 * @return the parsed items in document order; empty if the page has no {@code unit} element
    	 * @throws CommonException
    	 *             if downloading the page fails
    	 */
    	public List<NewsItem> getNewsItems( int newsType , int currentPage) throws CommonException
    	{
    		String urlStr = URLUtil.generateUrl(newsType, currentPage);
    		String htmlStr = DataUtil.doGet(urlStr);

    		List<NewsItem> newsItems = new ArrayList<NewsItem>();

    		Document doc = Jsoup.parse(htmlStr);
    		for (Element unit : doc.getElementsByClass("unit"))
    		{
    			NewsItem newsItem = new NewsItem();
    			newsItem.setNewsType(newsType);

    			// <h1><a href="...">title</a></h1>
    			Element titleLink = unit.getElementsByTag("h1").get(0).child(0);
    			newsItem.setLink(titleLink.attr("href"));
    			newsItem.setTitle(titleLink.text());

    			Element h4 = unit.getElementsByTag("h4").get(0);
    			newsItem.setDate(h4.getElementsByClass("ago").get(0).text());

    			Element dl = unit.getElementsByTag("dl").get(0);
    			String imgLink = findImageLink(dl.child(0));
    			if (imgLink != null)
    			{
    				newsItem.setImgLink(imgLink);
    			}

    			newsItem.setContent(dl.child(1).text());
    			newsItems.add(newsItem);
    		}

    		return newsItems;
    	}

    	/**
    	 * Returns the {@code src} attribute of the first grandchild of {@code dt}, or {@code null}
    	 * when {@code dt} or its first child has no child element (the unit has no picture).
    	 */
    	private static String findImageLink(Element dt)
    	{
    		// Explicit emptiness checks replace the former empty catch of IndexOutOfBoundsException.
    		if (dt.children().isEmpty())
    		{
    			return null;
    		}
    		Element wrapper = dt.child(0);
    		if (wrapper.children().isEmpty())
    		{
    			return null;
    		}
    		return wrapper.child(0).attr("src");
    	}

    }
    
    好了,最后就是测试了,这里使用单元测试,下面是测试代码和结果。

    测试代码:

    package com.zhy.test;
    
    import java.util.List;
    
    import com.zhy.bean.CommonException;
    import com.zhy.bean.NewsItem;
    import com.zhy.biz.NewsItemBiz;
    import com.zhy.csdn.Constaint;
    import com.zhy.csdn.DataUtil;
    
    /**
     * Manual smoke test that fetches the first list page of several news types and prints
     * every parsed item.
     */
    public class Test
    {

    	/**
    	 * Fetches page 1 of four news types in a fixed order and prints each item followed by a
    	 * separator line per type.
    	 * 
    	 * @throws CommonException
    	 *             if any fetch fails; it is left to propagate so the test is reported as failed
    	 *             instead of passing after merely printing a stack trace
    	 */
    	@org.junit.Test
    	public void test01() throws CommonException
    	{
    		NewsItemBiz biz = new NewsItemBiz();
    		int currentPage = 1;

    		// Same types and order as the original hand-written sections.
    		int[] newsTypes = {
    				Constaint.NEWS_TYPE_YEJIE,
    				Constaint.NEWS_TYPE_CHENGXUYUAN,
    				Constaint.NEWS_TYPE_YANFA,
    				Constaint.NEWS_TYPE_YIDONG };

    		for (int newsType : newsTypes)
    		{
    			List<NewsItem> newsItems = biz.getNewsItems(newsType, currentPage);
    			for (NewsItem item : newsItems)
    			{
    				System.out.println(item);
    			}
    			System.out.println("----------------------");
    		}
    	}

    }
    

    结果:

    NewsItem [id=0, title=如何做到每天写代码?, date=2014-04-11 11:26, newsType=1]
    NewsItem [id=0, title=一周消息树:超级充电器来袭,30秒可为手机充满电, date=2014-04-11 15:20, newsType=1]
    NewsItem [id=0, title=Google Glass于4月15日在美对外开放购买,售价为1500美元, date=2014-04-11 08:01, newsType=1]
    NewsItem [id=0, title=Cortana与Siri、Google Now的较量:支持功能更多, date=2014-04-10 16:30, newsType=1]
    NewsItem [id=0, title=优秀Unix管理员的七个习惯, date=2014-04-10 10:58, newsType=1]
    NewsItem [id=0, title=国外用户也不幸福!Facebook强制用户必须下载Messager, date=2014-04-10 09:10, newsType=1]
    NewsItem [id=0, title=ThoughtWorks CTO谈IT职场女性:你并不奇怪, date=2014-04-09 18:18, newsType=1]
    NewsItem [id=0, title=微软转型之路:从Build 2014开始, date=2014-04-09 17:05, newsType=1]
    NewsItem [id=0, title=设计师为什么要学编程,开发者为什么要学设计?, date=2014-04-09 14:07, newsType=1]
    NewsItem [id=0, title=Windows 8.1 Update 1的下载地址和八点???知, date=2014-04-09 08:38, newsType=1]
    ----------------------
    NewsItem [id=0, title=页面仔和他的小创新, date=2014-04-11 11:09, newsType=4]
    NewsItem [id=0, title=未来两年必须掌握的移动互联网技术与能力, date=2014-04-10 14:43, newsType=4]
    NewsItem [id=0, title=互联网思维到底是什么——移动浪潮下的新商业逻辑, date=2014-04-09 13:05, newsType=4]
    NewsItem [id=0, title=虚拟现实之眼——Oculus与HMD关键技术, date=2014-04-09 12:47, newsType=4]
    NewsItem [id=0, title=如何实现团队的自组织管理, date=2014-04-09 11:59, newsType=4]
    NewsItem [id=0, title=途牛网CTO汤峥嵘:互联网思维——光说不练远远不够, date=2014-04-08 11:10, newsType=4]
    NewsItem [id=0, title=理解创客, date=2014-04-04 17:55, newsType=4]
    NewsItem [id=0, title=TypeScript:更好的JavaScript, date=2014-04-03 16:10, newsType=4]
    NewsItem [id=0, title=Chris Anderson:我们正经历一场真正的革命, date=2014-04-02 14:45, newsType=4]
    NewsItem [id=0, title=Cocos2d-x 3.0带来了什么, date=2014-04-02 14:09, newsType=4]
    ----------------------
    NewsItem [id=0, title=研发周报:Perl创历史新低, date=2014-04-11 14:13, newsType=3]
    NewsItem [id=0, title=代码面试最常用的10大算法, date=2014-04-10 11:34, newsType=3]
    NewsItem [id=0, title=TIOBE 2014年4月编程语言排行榜:Perl跌至历史最低点, date=2014-04-10 09:20, newsType=3]
    NewsItem [id=0, title=金蝶发布Apusic智慧云平台 构建产业联盟推动信息化建设, date=2014-04-09 10:38, newsType=3]
    NewsItem [id=0, title=OpenSSL究竟为何物,为何它的影响力如此之大?, date=2014-04-09 08:52, newsType=3]
    NewsItem [id=0, title=Airbnb的管理之道:产品设计的点评策略与技巧, date=2014-04-09 07:01, newsType=3]
    NewsItem [id=0, title=大势所趋 HTML5成Web开发者最关心的技术, date=2014-04-08 14:30, newsType=3]
    NewsItem [id=0, title=研发周报:微软Build2014精华汇总, date=2014-04-04 16:09, newsType=3]
    NewsItem [id=0, title=Facebook发布PlanOut 开源部分A/B测试源码, date=2014-04-04 11:02, newsType=3]
    NewsItem [id=0, title=撼动企业应用架构的十大技术趋势, date=2014-04-08 14:40, newsType=3]
    ----------------------
    NewsItem [id=0, title=2014移动开发者必备的十大应用测试工具, date=22小时前, newsType=2]
    NewsItem [id=0, title=前《连线》主编Chris Anderson:创客就要DIT, date=22小时前, newsType=2]
    NewsItem [id=0, title=创客天下——《Make》及Maker Faire创办人、O'Reilly Media创始人Dale Dougherty专访, date=2014-04-11 11:21, newsType=2]
    NewsItem [id=0, title=《近匠》aGlass团队:透析眼控技术的价值, date=2014-04-11 10:51, newsType=2]
    NewsItem [id=0, title=UC多屏战略 推出电脑版和电视版浏览器, date=2014-04-11 07:07, newsType=2]
    NewsItem [id=0, title=“颠覆医疗” 时云医疗推三款硬件产品, date=2014-04-10 21:05, newsType=2]
    NewsItem [id=0, title=2014Unity亚洲开发者大会倒计时 干货内容日程汇总, date=2014-04-10 10:06, newsType=2]
    NewsItem [id=0, title=《近匠》棱镜:手游渠道SDK平台的技术历程, date=2014-04-09 10:27, newsType=2]
    NewsItem [id=0, title=绝对的超现实!Jaunt打造360°全景VR电影, date=2014-04-08 15:45, newsType=2]
    NewsItem [id=0, title=Unite China 2014课程解析:行业解决方案专场免费开放, date=2014-04-08 13:13, newsType=2]
    ----------------------
    

    好了,最后打成jar包,下篇博客中会把它放入咱们待完成的Android项目中使用。



    如果你觉得这篇文章对你有帮助,可以顶一个。


    源码点击下载


  • 相关阅读:
    序列JSON数据和四种AJAX操作方式
    jquery.validate和jquery.form.js实现表单提交
    JQuery Validate使用总结1:
    HOWTO: Include Base64 Encoded Binary Image Data (data URI scheme) in Inline Cascading Style Sheets (CSS)(转)
    SharePoint 2007 使用4.0 .Net
    动态IP解决方案
    取MS CRM表单的URL
    从Iframe或新开的窗口访问MS CRM 2011(转)
    Toggle or Hidden MS CRM Tab
    Windows 2008下修改域用户密码
  • 原文地址:https://www.cnblogs.com/oversea201405/p/3752046.html
Copyright © 2011-2022 走看看