任务描述:
某图书网站按条件查询得出一页20条记录,每条记录有书目的简要信息和url链接到书的详细信息。
需要抓取网站图书的详细信息,保存到本地数据库中。
任务分析:
用httpclient模拟浏览器请求url,取回网站的页面内容,再用jericho包分析页面元素,取出需要的信息并保存到数据库中。
因为数据量比较大,所以采用多线程的方式来抓取详细页面并分析提取数据。
处理过程:
按条件查询到图书列表信息后,主线程不停地向下翻页,分析出每本图书的详细页url,并将url保存到一个ArrayList中;同时启动多个子线程分别去抓取详细页面的信息,然后利用jericho包分析页面数据并保存到数据库中。
代码实现截取如下:
......
public class BookCatcher
{
// Queue of book-detail URLs that have not been processed yet.
// It is a plain ArrayList, so readers and writers must synchronize on the list itself.
private static ArrayList threads= new ArrayList();
// Set to true by the producing (main) thread once it has queued every URL.
// Declared volatile because worker threads poll this flag from other threads;
// without it the Java memory model does not guarantee they ever observe the update.
public static volatile boolean isFinished=false;
/**
 * Takes the oldest pending URL off the shared queue.
 *
 * @return the string form of the first queued entry, or {@code null} when the
 *         queue is empty or the lookup fails with an exception
 */
public String getUrl() {
    try {
        synchronized (threads) {
            // Guard clause: nothing queued right now.
            if (threads.isEmpty()) {
                return null;
            }
            // Convert first, remove second, so the entry is only dropped
            // from the queue once its string form has been obtained.
            final String head = String.valueOf(threads.get(0));
            threads.remove(0);
            return head;
        }
    } catch (Exception ignored) {
        // Best effort: any failure is reported to the caller as "no URL available".
        return null;
    }
}
/**
 * Starts ten {@code Processer} worker threads, then walks the result pages
 * (index 0 to {@code pages - 1}), extracts the {@code href} of every element
 * with class "ProductTitle" and appends a URL to the shared {@code threads}
 * queue for the workers to consume.
 *
 * The "..." / "...." lines are code the author left out of this abridged
 * snippet; {@code pages}, {@code url}, {@code source} and {@code bookurl}
 * are declared somewhere in those omitted parts.
 */
public void process(){
// preprocessing (omitted from this snippet)
// start 10 worker threads that wait for URLs to process
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
new Thread(new Processer(this)).start();
....
for(int j=0;j<pages;j++)// page from the first result page to the last
{
...
// download and parse the current list page
source = CommonUtil.getSourceByUrl(url);
List<Element> elements = source.getAllElementsByClass("ProductTitle");
for (Element element : elements){
// the first start tag inside each "ProductTitle" element carries the link to the detail page
String href = element.getContent().getFirstStartTag().getAttributeValue("href");
if (href!=null && !"".equals(href)){
// the queue is a plain ArrayList shared with the workers, so guard it with its own monitor
synchronized (threads) {
// NOTE(review): `bookurl` is not defined in the visible code (only `href` is);
// presumably it is built from `href` in an omitted line - confirm.
threads.add(bookurl);// store the URL in the queue
}
}
}
// NOTE(review): by brace count this assignment is still inside the page loop, so it would
// run after the first page; the original comment suggests it is meant to run once after
// the loop - confirm against the full source.
isFinished=true; // the main thread has finished queuing all the URLs
}
/**
 * Worker that repeatedly takes a book-detail URL from its {@link BookCatcher},
 * downloads and parses that page, copies the extracted fields into a
 * {@code BookBean} and saves it through {@code BookBO}.
 *
 * A page that cannot be downloaded or parsed is reported on stderr and skipped;
 * it no longer terminates the worker thread. The worker stops when the queue is
 * empty and {@code BookCatcher.isFinished} was already set, or when it is
 * interrupted while idle.
 */
class Processer implements Runnable
{
    /** Separator between the "name:value" entries in the rendered "Left"/"Right" blocks. */
    private static final String ENTRY_SEPARATOR = "·";
    /** Separator between the name and the value inside one entry. */
    private static final String NAME_VALUE_SEPARATOR = ":";
    /** Summaries longer than this many characters are shortened before saving. */
    private static final int MAX_CONTENT_LENGTH = 2000;
    /** Number of leading characters kept when a summary is shortened. */
    private static final int TRUNCATED_CONTENT_LENGTH = 1990;
    /** Pause between polls while the queue is empty, in milliseconds. */
    private static final long IDLE_SLEEP_MILLIS = 1000;

    BookCatcher c;

    /**
     * @param c the catcher whose URL queue this worker drains
     */
    public Processer(BookCatcher c)
    {
        this.c = c;
    }

    /**
     * Polls the queue until the producer has finished and the queue is empty.
     */
    public void run()
    {
        while (true)
        {
            // Read the flag BEFORE polling the queue. If the producer was already
            // done and the queue is empty afterwards, nothing more can arrive.
            // Checking in the opposite order could miss a URL queued between
            // the poll and the flag check.
            boolean producerDone = BookCatcher.isFinished;
            String bookUrl = c.getUrl();
            if (bookUrl != null)
            {
                try
                {
                    processBook(bookUrl);
                }
                catch (RuntimeException e)
                {
                    // Thread boundary: one malformed or unreachable page must not
                    // kill the worker and strand the remaining URLs.
                    System.err.println("Failed to process book page " + bookUrl);
                    e.printStackTrace();
                }
            }
            else if (producerDone)
            {
                return;
            }
            else
            {
                // Nothing queued yet: wait a little and poll again.
                try
                {
                    Thread.sleep(IDLE_SLEEP_MILLIS);
                }
                catch (InterruptedException e)
                {
                    // Keep the interrupt status visible to whoever owns the thread and stop.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Downloads one detail page, extracts its fields and saves the book.
     *
     * @param bookUrl URL of the book detail page
     * @throws IllegalStateException if the page could not be fetched or lacks a
     *         required element (title, "Left"/"Right" detail blocks, list price)
     */
    private void processBook(String bookUrl)
    {
        Source source = CommonUtil.getSourceByUrl(bookUrl);
        if (source == null)
        {
            throw new IllegalStateException("page could not be downloaded");
        }

        BookBean bean = new BookBean();
        bean.setStoreBookUrl(bookUrl);

        // title
        StartTag tag = requireTag(source.getFirstStartTagByClass("BookTitle"), "BookTitle");
        bean.setName(tag.getRenderer().toString());

        // author: text of the first link inside the "bookAuthor" element (optional)
        tag = source.getFirstStartTagByClass("bookAuthor");
        if (tag != null)
        {
            List<StartTag> links = tag.getElement().getAllStartTags(HTMLElementName.A);
            if (links.size() > 0)
            {
                bean.setAuthor(links.get(0).getElement().getContent().toString());
            }
        }

        // cover image (optional; an image tag without a src attribute is ignored)
        tag = source.getFirstStartTag("id", "BookImage", false);
        if (tag != null)
        {
            String src = tag.getAttributeValue("src");
            if (src != null)
            {
                bean.setPicUrl(src.trim());
            }
        }

        // "name:value" details from the "Left" block, then from the "Right" block
        List<String> entries = new ArrayList<String>();
        addEntries(requireTag(source.getFirstStartTagByClass("Left"), "Left"), entries);
        addEntries(requireTag(source.getFirstStartTagByClass("Right"), "Right"), entries);
        for (String entry : entries)
        {
            applyEntry(bean, entry);
        }

        // list price: rendered text of the first <strike> tag, minus its first character
        tag = requireTag(source.getFirstStartTagByClass("BookPrice"), "BookPrice");
        List<StartTag> strikes = tag.getElement().getAllStartTags(HTMLElementName.STRIKE);
        if (strikes.isEmpty())
        {
            throw new IllegalStateException("no list price inside the \"BookPrice\" element");
        }
        String price = strikes.get(0).getRenderer().toString();
        if (price.length() > 0)
        {
            price = price.substring(1);
        }
        bean.setPrice(price);

        // store price: falls back to "0" when the page shows none
        String storePrice = "0";
        tag = source.getFirstStartTagByClass("DetailPrice");
        if (tag != null)
        {
            List<StartTag> ourPrices = tag.getElement().getAllStartTagsByClass("OurPrice");
            if (ourPrices != null && !ourPrices.isEmpty())
            {
                storePrice = ourPrices.get(0).getRenderer().toString();
            }
        }
        bean.setStorePrice(storePrice);

        // summary: first "ContentValue" block, taken only when the page has more than one
        List<StartTag> tagList = source.getAllStartTagsByClass("ContentValue");
        if (tagList != null && tagList.size() > 1)
        {
            String content = tagList.get(0).getRenderer().toString().trim();
            if (content.length() > MAX_CONTENT_LENGTH)
            {
                content = content.substring(0, TRUNCATED_CONTENT_LENGTH) + "...";
            }
            bean.setContent(content);
        }

        new BookBO().saveBook(bean);
    }

    /** Returns {@code tag}, or fails with a descriptive message when the page lacks it. */
    private StartTag requireTag(StartTag tag, String cssClass)
    {
        if (tag == null)
        {
            throw new IllegalStateException(
                    "required element with class \"" + cssClass + "\" not found");
        }
        return tag;
    }

    /** Splits the rendered text of {@code block} into entries and appends the non-empty ones. */
    private void addEntries(StartTag block, List<String> entries)
    {
        for (String entry : block.getRenderer().toString().split(ENTRY_SEPARATOR))
        {
            if (!"".equals(entry))
            {
                entries.add(entry);
            }
        }
    }

    /** Copies one "name:value" entry into the matching bean property; other entries are ignored. */
    private void applyEntry(BookBean bean, String entry)
    {
        String[] parts = entry.split(NAME_VALUE_SEPARATOR);
        if (parts.length < 2)
        {
            // No value part: nothing to store.
            return;
        }
        String name = CommonUtil.getString(parts[0]);
        String value = CommonUtil.getString(parts[1]);
        if ("ISBN".equals(name))
        {
            bean.setIsbn(value);
        }
        else if ("出版社".equals(name))
        {
            bean.setPublisherOrg(value);
        }
        else if ("页码".equals(name))
        {
            bean.setPageNum(value);
        }
        else if ("出版日期".equals(name))
        {
            bean.setPublishDate(value);
        }
        else if ("装帧".equals(name))
        {
            bean.setWrapType(value);
        }
        else if ("开本".equals(name))
        {
            bean.setFormat(value);
        }
    }
}
/**
 * CommonUtil helper: issues an HTTP GET for {@code url} with HttpClient and
 * loads the response body into a Jericho {@code Source}.
 *
 * A non-200 status is logged as an error, but the response body is still
 * parsed and returned.
 *
 * @param url the address to fetch
 * @return the parsed page, or {@code null} when the request fails with an
 *         {@code HttpException}/{@code IOException} or the response has no body
 */
public static Source getSourceByUrl(final String url) {
    Source source = null;
    HttpClient httpClient = new HttpClient();
    GetMethod getMethod = new GetMethod(url);
    getMethod.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
    getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
            new DefaultHttpMethodRetryHandler());
    try {
        int statusCode = httpClient.executeMethod(getMethod);
        if (statusCode != HttpStatus.SC_OK) {
            log.error("Method failed: " + getMethod.getStatusLine());
        }
        // The body stream can be absent (no response body); do not hand null to the parser.
        java.io.InputStream body = getMethod.getResponseBodyAsStream();
        if (body != null) {
            source = new Source(body);
        } else {
            log.error("Empty response body for " + url);
        }
    } catch (HttpException e) {
        // Route the stack trace through the logger instead of stderr.
        log.error("Please check your provided http address!", e);
    } catch (IOException e) {
        log.error("I/O error while fetching " + url, e);
    } finally {
        // Always hand the connection back, whether or not the request succeeded.
        getMethod.releaseConnection();
    }
    return source;
}
关键字: httpclient, jericho, 多线程