zoukankan      html  css  js  c++  java
  • 博客导出工具(C++实现,支持sina,csdn,自定义列表)


    操作系统:Windows(所有版本)

    编程工具:visual studio 2013

    编程语言:VC++

        

          最近博文更新得较频繁,为了防止账号异常导致csdn博文丢失,花了点时间做了个小工具来导出博文,用作备份。本文将从源码分析整个实现过程。先看个截图:




    操作步骤:

    1. 先在博客地址文本框输入博客地址例如:http://blog.csdn.net/yxstars/,http://blog.sina.com.cn/yxstars/,
      http://www.cnblogs.com/yxstars/
    2. 然后点击确定,将显示共有多少篇博文,例如:[19:32:47]博文113篇
    3. 点击文章列表:将显示所有博文,格式:title,href
    4. 点击导出博文:将博文导出到当前目录下的Blog文件夹中,博文格式为html。
    5. 遍历博文:将遍历所有博文并且显示出来。
    6. 刷新:刷新所有博文,不显示。
    7. 图片:导出的博文,图片下载到本地,博文图片链接到本地。
    8. 列表:支持自定义的列表链接博文(当前目录下有个list.ini,可以自定义链接)。
    9. 刷新次数:自定义,循环次数。
    10. 时间间隔:每次循环sleep时间。

    源码分析:


    1. 获取对应的url页面源代码,实现如下:


    // Downloads the document at strUrl and appends its text to UrlData.
    //
    // The document is read line by line with ReadString and every line is
    // followed by "\r\n". UrlData is appended to, never cleared, so a caller
    // that wants only this document must pass an empty string.
    //
    // Returns true when the URL was opened and read to the end. On failure a
    // message is shown through ShowMes and false is returned; UrlData may then
    // hold the lines that were read before the error.
    //
    // NOTE(review): the published listing had a raw line break inside the
    // string literal; "\r\n" is assumed to be the lost escape sequence.
    bool CBlogExportDlg::GetUrlStr(CString strUrl, CString& UrlData)
    {
    	CInternetSession session;
    	CHttpFile *file = NULL;
    	try{
    		file = (CHttpFile*)session.OpenURL(strUrl);
    		if (!file){
    			session.Close();
    			ShowMes(strUrl + "获取失败...");
    			return false;
    		}
    
    		// Reading stays inside the try block so that a network error in the
    		// middle of the transfer is reported instead of leaking `file`.
    		CString sRecived;
    		while (file->ReadString(sRecived)) {
    			UrlData += sRecived + "\r\n";
    		}
    	}
    	catch (CInternetException *m_pException){
    		m_pException->Delete();
    		// NOTE(review): the file object is only deleted here, relying on its
    		// destructor to release the connection - confirm.
    		delete file;
    		file = NULL;
    		session.Close();
    		ShowMes("网络连接错误...");
    		return false;
    	}
    
    	// Release the file before the session it was opened on.
    	file->Close();
    	delete file;
    	file = NULL;
    	session.Close();
    	return true;
    }


    2. 获取的html源码为utf8格式,需要转为ansi格式,C++实现代码如下:


    // Converts UTF-8 text to the system ANSI code page (CP_ACP).
    //
    // strSource  UTF-8 bytes held in a CString. It is trimmed of leading and
    //            trailing whitespace IN PLACE before the conversion.
    // strChAnsi  receives the converted text. It is left untouched when
    //            strSource is empty on entry, and emptied otherwise.
    //
    // Returns 0 when there was nothing to convert or the conversion succeeded,
    // and 1 when either conversion step failed or produced fewer characters
    // than it announced.
    //
    // The conversion goes UTF-8 -> UTF-16 -> ANSI because the Win32 API offers
    // no direct path between two multi-byte code pages.
    int CBlogExportDlg::ConvUtf8ToAnsi(CString& strSource, CString& strChAnsi)
    {
    	if (strSource.GetLength() <= 0)
    		return 0;
    
    	strSource.TrimLeft();
    	strSource.TrimRight();
    	strChAnsi.Empty();
    
    	const int iLenSource = strSource.GetLength();
    	if (iLenSource <= 0)
    		return 0;	// the input consisted of whitespace only
    
    	// Step 1: UTF-8 -> UTF-16. The first call only measures.
    	const int iLenByWChNeed = MultiByteToWideChar(CP_UTF8, 0,
    		strSource.GetString(),
    		iLenSource,
    		NULL, 0);
    	if (iLenByWChNeed <= 0)
    		return 1;
    
    	// A real wide string is used as scratch space instead of casting the
    	// buffer of a narrow CString to LPWSTR.
    	CStringW strWChUnicode;
    	const int iLenByWchDone = MultiByteToWideChar(CP_UTF8, 0,
    		strSource.GetString(),
    		iLenSource,
    		strWChUnicode.GetBuffer(iLenByWChNeed),
    		iLenByWChNeed);
    	strWChUnicode.ReleaseBuffer(iLenByWchDone);
    	if (iLenByWchDone <= 0)
    		return 1;
    
    	// Step 2: UTF-16 -> ANSI. Again measure first, then convert.
    	const int iLenByChNeed = WideCharToMultiByte(CP_ACP, 0,
    		strWChUnicode.GetString(),
    		iLenByWchDone,
    		NULL, 0,
    		NULL, NULL);
    	if (iLenByChNeed <= 0)
    		return 1;
    
    	const int iLenByChDone = WideCharToMultiByte(CP_ACP, 0,
    		strWChUnicode.GetString(),
    		iLenByWchDone,
    		strChAnsi.GetBuffer(iLenByChNeed),
    		iLenByChNeed,
    		NULL, NULL);
    	strChAnsi.ReleaseBuffer(iLenByChDone);
    
    	if (iLenByWChNeed != iLenByWchDone || iLenByChNeed != iLenByChDone)
    		return 1;
    
    	return 0;
    }
    


    3. 消息文本框显示


    // Appends one line to the message box control MesEdit.
    //
    // The line has the form "[HH:MM:SS]<mes>" followed by "\r\n". The selection
    // is first moved to the end of the existing text so that ReplaceSel appends
    // instead of overwriting.
    //
    // NOTE(review): the published listing had a raw line break inside the
    // string literal; "\r\n" is assumed to be the lost escape sequence.
    // NOTE(review): this is also called from the WriteCycle and ReadCycle
    // thread entry points - confirm that touching MesEdit from there is
    // acceptable.
    void CBlogExportDlg::ShowMes(CString mes)
    {
    	CTime time;
    	time = CTime::GetCurrentTime();//Get the current time
    	CString Times = _T("[") + time.Format("%H:%M:%S") + "]";//Conversion time format
    
    	int len = MesEdit.GetWindowTextLength();
    	MesEdit.SetSel(len, len);
    	MesEdit.ReplaceSel(Times + mes + _T("\r\n"));
    }


    4. 点击确定按钮后,实现代码


    // Handler for the OK button.
    //
    // Reads the blog address from IDC_EDIT_ADDRESS, downloads that page, saves
    // the raw bytes to the file "temp" under strDirPath, converts the page from
    // UTF-8 to ANSI and passes it to GetBlogInfo.
    //
    // Every failure is reported through ShowMes (directly or by the helper that
    // failed) and ends the handler early.
    void CBlogExportDlg::OnBnClickedButtonOk()
    {
    	GetDlgItemText(IDC_EDIT_ADDRESS, blogAdr);
    	ShowBlogAdr();
    	// Expected form: http://blog.csdn.net/yxstars/
    	int pos = blogAdr.Find("http://blog.csdn.net/");
    	if (pos == -1){
    		ShowMes("csdn blog地址不对...");
    		// Stop here: the rest of the handler only understands CSDN pages.
    		return;
    	}
    
    	// The list handler appends "/article/list/<n>" to blogAdrs, so a trailing
    	// slash typed by the user would produce "//article/list/<n>".
    	blogAdrs = blogAdr;
    	blogAdrs.TrimRight('/');
    
    	CString urlData;
    	if (!GetUrlStr(blogAdr, urlData)){
    		return;
    	}
    
    	CFile fs;
    	if (!fs.Open(strDirPath + "temp", CFile::modeCreate | CFile::modeWrite)){
    		ShowMes(strDirPath + "temp" + "创建失败...");
    		return;
    	}
    
    	// The page is stored exactly as downloaded, before any conversion.
    	fs.Write(urlData, urlData.GetLength());
    	fs.Close();
    
    	CString ansiUrlData;
    	ConvUtf8ToAnsi(urlData, ansiUrlData);
    	GetBlogInfo(ansiUrlData);
    
    }


    5. 根据博客地址,获取源代码后分析,查找博文数目,和博文列表页数。


    <!--显示分页 -->
    
    <div id="papelist" class="pagelist">
    <span> 113条数据  共6页</span><strong>1</strong> <a href="/yxstars/article/list/2">2</a> <a href="/yxstars/article/list/3">3</a> <a href="/yxstars/article/list/4">4</a> <a href="/yxstars/article/list/5">5</a> <a href="/yxstars/article/list/6">...</a> <a href="/yxstars/article/list/2">下一页</a> <a href="/yxstars/article/list/6">尾页</a> 
    </div>

    从上面的代码中可以获取信息如下:

    <span> 113条数据  共6页</span>, 共有113篇博文,共有6页。

     <a href="/yxstars/article/list/3">,页面链接地址为/yxstars/article/list/ + 要显示的页数。


    C++代码实现如下:

    // Extracts the article count and the number of list pages from the pager
    // block of a blog page and stores the page count in blogListPage.
    //
    // The expected markup is
    //     <div id="papelist" class="pagelist">
    //     <span> 113条数据  共6页</span>...
    // where 113 is the article count and 6 the page count.
    //
    // urlData  page source, already converted to ANSI. It is consumed: on
    //          return it holds only the text following the pager's <span>.
    //
    // On success "博文<count>篇" is shown through ShowMes. On any parse failure
    // an error message is shown instead and blogListPage is left unchanged.
    void CBlogExportDlg::GetBlogInfo(CString& urlData)
    {
    	const CString pagerTag("<div id=\"papelist\" class=\"pagelist\">");
    	const CString spanOpen("<span>");
    	const CString pagePrefix("条数据  共");
    	const CString pageSuffix("页</span>");
    
    	int pos = urlData.Find(pagerTag);
    	if (pos == -1){
    		ShowMes("获取列表数目失败...");
    		return;
    	}
    	urlData = urlData.Mid(pos + pagerTag.GetLength());
    
    	// Locate the <span> instead of skipping a fixed number of characters, so
    	// the result does not depend on which line-break bytes separate the tags.
    	pos = urlData.Find(spanOpen);
    	if (pos == -1){
    		ShowMes("获取列表数目失败...");
    		return;
    	}
    	urlData = urlData.Mid(pos + spanOpen.GetLength());
    
    	pos = urlData.Find("条数据");
    	if (pos == -1){
    		ShowMes("获取列表条数失败...");
    		return;
    	}
    
    	// The count is preceded by a space in the markup; drop it for display.
    	CString blogListNum = urlData.Left(pos);
    	blogListNum.Trim();
    	
    	pos = urlData.Find(pagePrefix);
    	int poss = urlData.Find(pageSuffix);
    	const int prefixLen = pagePrefix.GetLength();
    	if ((poss == -1) || (pos == -1) || (poss < pos + prefixLen)){
    		ShowMes("获取列表页数失败...");
    		return;
    	}
    
    	// The page count sits between the prefix and the suffix.
    	CString listPage = urlData.Mid(pos + prefixLen, poss - pos - prefixLen);
    	blogListPage = StrToInt(listPage);
    	ShowMes("博文" + blogListNum + "篇");
    }


    6. 当点击显示列表时,根据之前的页面地址获取信息。

    void CBlogExportDlg::OnBnClickedButtonList()
    {
    	clearMes();
    	CString urlData, ansiUrlData, listPage;
    	//http://blog.csdn.net/yxstars/article/list/1
    	FileListMap.clear();
    	listNum = 1;
    
    	for (int i = 1; i < blogListPage + 1; i++){
    		urlData.Empty();
    		ansiUrlData.Empty();
    		listPage.Format("%d", i);
    		blogAdr = blogAdrs + "/article/list/" + listPage;
    		ShowBlogAdr();
    		if (!GetUrlStr(blogAdr, urlData)){
    			return;
    		}
    
    		ConvUtf8ToAnsi(urlData, ansiUrlData);
    		GetFileList(ansiUrlData);
    	}
    
    }


    7. 在每个页面获取文章列表和页面地址。

        <h1>
            <span class="link_title"><a href="/yxstars/article/details/38469431">
            <font color="red">[置顶]</font>
            金融系列12《双币电子现金方案》
            </a></span>
        </h1>
    

    从上面源码可以看出:

    <span class="link_title">后面就是博文链接地址。

     </a>前面的就是博文标题。

    如果有置顶操作,会多出这部分<font color="red">[置顶]</font>


    C++解析文章列表的实现如下:

    // Collects every article link found in one list page.
    //
    // Each entry is expected to look like
    //     <span class="link_title"><a href="/user/article/details/123">
    //     [optional <font ...>...</font> tag]
    //     Title text
    //     </a></span>
    //
    // For every entry the title (the text after the last '>' before </a>,
    // trimmed) is mapped to "http://blog.csdn.net" + href in FileListMap, and a
    // numbered line is shown through ShowMes. listNum is incremented per entry.
    //
    // urlData  page source, already converted to ANSI. It is only read; the
    //          page is scanned by position instead of being copied per entry.
    //
    // Parsing stops with an error message at the first malformed entry.
    void CBlogExportDlg::GetFileList(CString& urlData)
    {	
    	const CString linkMarker("<span class=\"link_title\"><a href=\"");
    	CString strListNum;
    
    	int posF = urlData.Find(linkMarker);
    	while (posF != -1){
    		// The href value runs from the end of the marker to the next quote.
    		const int hrefBegin = posF + linkMarker.GetLength();
    		const int posE = urlData.Find("\"", hrefBegin);
    		if (posE == -1){
    			ShowMes("获取列表失败...");
    			return;
    		}
    
    		CString href = urlData.Mid(hrefBegin, posE - hrefBegin);
    		const int titleEnd = urlData.Find("</a>", posE);
    		if (titleEnd == -1){
    			ShowMes("获取列表失败...");
    			return;
    		}
    
    		// Skip the closing quote and the '>' that end the <a ...> tag.
    		CString title = urlData.Mid(posE+2, titleEnd-posE-2);
    		// An entry may carry an extra tag before the title; keep only the
    		// text after the last '>'.
    		const int lastTagEnd = title.ReverseFind('>');
    		if (lastTagEnd != -1){
    			title = title.Mid(lastTagEnd + 1);
    		}
    		title.Trim("\r\n").Trim();
    		href = "http://blog.csdn.net" + href;
    		FileListMap[title] = href;
    		strListNum.Format("%03d", listNum++);
    		// Pad and cut to a fixed width so the links line up in the output.
    		strListNum = (strListNum + ":" + title + "                                            ").Left(45);
    		ShowMes(strListNum + href);
    		posF = urlData.Find(linkMarker, titleEnd);
    	}
    }


    8. 当点击导出博文时,我们只需把源代码保存为html格式即可,采用多线程实现:

    void CBlogExportDlg::OnBnClickedButtonExport()
    {
    	clearMes();
    	unsigned tid;
    	unsigned long thd = _beginthreadex(NULL, 0, CBlogExportDlg::WriteCycle, this, 0, &tid);
    	if (thd != NULL)
    	{
    		CloseHandle((HANDLE)thd);
    	}
    
    }
    
    // Thread entry point that saves every article in FileListMap as an .html
    // file inside the "Blog" folder under strDirPath.
    //
    // p  the CBlogExportDlg that started the thread.
    //
    // For each (title, url) entry the page is downloaded with GetUrlStr and
    // written unchanged to "<Blog folder><title>.html". Characters that Windows
    // does not allow in a file name are replaced by '_' in the title.
    //
    // Returns 0 when all articles were written, 1 when the folder could not be
    // created, a download failed or stopRun was set, and 2 when a file could
    // not be created.
    unsigned __stdcall  CBlogExportDlg::WriteCycle(void* p)
    {
    	// Characters that may not appear in a Windows file name.
    	static const TCHAR invalidChars[] = _T("\\/:*?\"<>|");
    
    	CBlogExportDlg* dlg = (CBlogExportDlg*)p;
    	CString blogFolderPath = dlg->strDirPath + "Blog\\";
    	if (!PathIsDirectory(blogFolderPath))
    	{
    		if (!CreateDirectory(blogFolderPath, NULL))
    		{
    			dlg->ShowMes(blogFolderPath + "创建失败...");
    			return 1;
    		}
    	}
    	
    
    	dlg->stopRun = false;
    	CString urlData, strList;
    	int iList = 1;
    	CFile cf;
    	std::map<CString, CString>::iterator iter;
    	for (iter = dlg->FileListMap.begin(); iter != dlg->FileListMap.end(); ++iter){
    		urlData.Empty();
    		if (!dlg->GetUrlStr(iter->second, urlData)){
    			return 1;
    		}
    		strList.Format("%3d", iList++);
    		dlg->ShowMes("正在导出第" + strList + "篇博文:" + iter->first);
    
    		// Build the file name from the title; a title holding a reserved
    		// character would otherwise make Open fail and end the export.
    		CString blogPath(iter->first);
    		for (const TCHAR* ch = invalidChars; *ch != 0; ++ch){
    			blogPath.Replace(*ch, _T('_'));
    		}
    		blogPath = blogFolderPath + blogPath + ".html";
    		if (!cf.Open(blogPath, CFile::modeCreate | CFile::modeWrite)){
    			dlg->ShowMes("创建文件失败" + blogPath);
    			return 2;
    		}
    		cf.Write(urlData, urlData.GetLength());
    		cf.Close();
    
    		// stopRun is checked after each article has been written.
    		if (dlg->stopRun){
    			return 1;
    		}
    
    	}
    	return 0;
    }


    9. 遍历博文时,只需依次访问之前保存的链接即可,实现如下:

    void CBlogExportDlg::OnBnClickedButtonRead()
    {
    	clearMes();
    	unsigned tid;
    	unsigned long thd = _beginthreadex(NULL, 0, CBlogExportDlg::ReadCycle, this, 0, &tid);
    	if (thd != NULL)
    	{
    		CloseHandle((HANDLE)thd);
    	}
    }
    
    
    // Thread entry point that steps through every article in FileListMap.
    //
    // p  the CBlogExportDlg that started the thread.
    //
    // For each entry the article address is stored in blogAdr, ShowBlogAdr is
    // called, a progress line is shown, and the thread sleeps for 3 seconds.
    //
    // Returns 0 after the last entry, or 1 as soon as stopRun is found set
    // after a sleep.
    unsigned __stdcall  CBlogExportDlg::ReadCycle(void* p)
    {
    	CBlogExportDlg* const self = static_cast<CBlogExportDlg*>(p);
    	self->stopRun = false;
    
    	std::map<CString, CString>::iterator entry = self->FileListMap.begin();
    	while (entry != self->FileListMap.end()){
    		self->blogAdr = entry->second;
    		self->ShowBlogAdr();
    		self->ShowMes("正在遍历博文:" + entry->first);
    		Sleep(3000);
    
    		if (self->stopRun){
    			return 1;
    		}
    		++entry;
    	}
    	return 0;
    }


    CSDN免积分下载地址:

    2014.08.01更新: http://download.csdn.net/detail/yxstars/7786309

    2014.09.05更新:http://download.csdn.net/detail/yxstars/7867583



    文/yanxin8原创,获取更多信息请访问http://yanxin8.com/222.html








  • 相关阅读:
    js-url打开方式
    eclipse删除所有空行
    Oracle重启 error: ora-01034:oracle not available ora-27101:shared memory realm does not exist
    最近面试遇到了高阶函数的概念混乱了
    关于跨域的cookie问题
    二叉树 呜呜
    函数的尾递归
    react context
    二叉树
    dom3级事件
  • 原文地址:https://www.cnblogs.com/iplus/p/4467109.html
Copyright © 2011-2022 走看看