zoukankan      html  css  js  c++  java
  • c++ 爬虫

    这是一个用 C++(Qt)编写的简单网络爬虫,效率并不是很高……

     1 #include<stdio.h>
     // Prefix-count tables filled by fun(); entry [n-1] describes the integer n.
     //   s1[n-1]: running count of integers whose decimal digits include a 5, a 2 and a 1
     //   s2[n-1]: running count of integers that contain the consecutive digits "521"
     int s1[1000000],s2[1000000];

     // Returns true when the decimal digits of n include at least one 5, one 2 and one 1.
     static bool hasDigits521(int n)
     {
         bool seen5 = false, seen2 = false, seen1 = false;
         for (; n != 0; n /= 10) {
             const int digit = n % 10;
             if (digit == 5)
                 seen5 = true;
             else if (digit == 2)
                 seen2 = true;
             else if (digit == 1)
                 seen1 = true;
         }
         return seen5 && seen2 && seen1;
     }

     // Returns true when the decimal representation of n contains "521" as consecutive digits.
     static bool hasRun521(int n)
     {
         for (; n >= 100; n /= 10) {
             if (n % 1000 == 521)   // hundreds = 5, tens = 2, ones = 1 at this position
                 return true;
         }
         return false;
     }

     // Fills s1/s2 for every integer i in [a, b], each entry extending the running
     // count stored for i-1.
     //
     // a is raised to 2 and b is lowered to the table size, so that neither the
     // read of entry [i-2] nor the write of entry [i-1] can leave the arrays.
     // Entries [0] and [1] are reset to 0 on every call.
     void fun(int a, int b)
     {
         const int tableSize = static_cast<int>(sizeof(s1) / sizeof(s1[0]));

         s1[0] = s2[0] = s1[1] = s2[1] = 0;

         if (a < 2)
             a = 2;              // entry [i-2] must exist
         if (b > tableSize)
             b = tableSize;      // entry [i-1] must exist

         for (int i = a; i <= b; i++) {
             const bool allThree = hasDigits521(i);
             s1[i-1] = s1[i-2] + (allThree ? 1 : 0);
             // A "521" run is only looked for in numbers that already hold all three digits.
             s2[i-1] = s2[i-2] + ((allThree && hasRun521(i)) ? 1 : 0);
         }
     }
    49 
    50 int main()
    51 {
    52     int a,b,i=1;
    53     fun(2,1000000);
    54     while(scanf("%d%d",&a,&b) != EOF){
    55         if(a == 1)
    56             printf("Case %d:%d %d
    ",i,s1[b-1]-s1[a-1],s2[b-1]-s2[a-1]);
    57         else
    58             printf("Case %d:%d %d
    ",i,s1[b-1]-s1[a-2],s2[b-1]-s2[a-2]);
    59         i++;
    60     }
    61     return 0;
    62 }
     1 #include"urlThread.h"
     2 #include<QFile>
     3 #include<QMessageBox>
     4 #include<QTextStream>
     5 #include <QMainWindow>
     6 void urlThread::run()
     7 {
     8     open();
     9 }
    10 
    11 void urlThread::startThread()
    12 {
    13     start();
    14 }
    15 
    16 //显示找到的url
    17 void urlThread::open()
    18 {
    19     QString path = "url.txt";
    20     QFile file(path);
    21     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
    22        // QMessageBox::warning(this,tr("Read File"),
    23         //                     tr("Cannot open file:
    %1").arg(path));
    24         send("error!cannot open url.txt!");
    25         return;
    26     }
    27     QTextStream in(&file);
    28     while(in.readLine().compare("") != 0){
    29         //ui->textBrowser->append(in.readLine());
    30         send(q2s(in.readLine()));
    31         Sleep(1);
    32     }
    33     file.close();
    34 }
    View Code
     1 #include "mainwindow.h"
     2 #include <QApplication>
     3 
     4 int main(int argc, char *argv[])
     5 {
     6     QApplication a(argc, argv);
     7     MainWindow w;
     8     w.setWindowTitle("小小爬虫");
     9     w.show();
    10 
    11     return a.exec();
    12 }
    View Code
     1 #include "mainwindow.h"
     2 #include "ui_mainwindow.h"
     3 
     4 MainWindow::MainWindow(QWidget *parent) :
     5     QMainWindow(parent),
     6     ui(new Ui::MainWindow)
     7 {
     8     ui->setupUi(this);
     9     QObject::connect(ui->start,SIGNAL(released()),this,SLOT(beginGeturl()));
    10     //QObject::connect(ui->display,SIGNAL(released()),this,SLOT(open()));
    11     QObject::connect(ui->display,SIGNAL(released()),&uth,SLOT(startThread()));
    12     QObject::connect(&uth,&urlThread::sendMessage,this,&MainWindow::receiveMessage);
    13     QObject::connect(&crawler,&Crawler::sendMessage,this,&MainWindow::receiveMessage);
    14 }
    15 
    16 MainWindow::~MainWindow()
    17 {
    18     delete ui;
    19 }
    20 
    21 void MainWindow::receiveMessage(const QString name)
    22 {
    23     ui->textBrowser->append(name);
    24     ui->textBrowser->moveCursor(QTextCursor::End);
    25 }
    26 
    27 void MainWindow::open()
    28 {
    29     QString path = "url.txt";
    30     QFile file(path);
    31     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {
    32         QMessageBox::warning(this,tr("Read File"),
    33                              tr("Cannot open file:
    %1").arg(path));
    34         return;
    35     }
    36     QTextStream in(&file);
    37     while(in.readLine().compare("") != 0){
    38         //ui->textBrowser->append(in.readLine());
    39         crawler.send(q2s(in.readLine()));
    40     }
    41     file.close();
    42 }
    43 
    44 void MainWindow::beginGeturl()
    45 {
    46     //crawler = new Crawler();
    47     string url = "" ,dep, filter = "www";
    48     if(!ui->site->text().isEmpty())
    49         url = q2s(ui->site->text());
    50     crawler.addURL(url);
    51     int depth = 1;
    52     if(!ui->depth->text().isEmpty())
    53     {
    54         url = q2s(ui->depth->text());
    55         depth = atoi(url.c_str());
    56     }
    57     if(!ui->filter->text().isEmpty())
    58         filter = q2s(ui->filter->text());
    59     crawler.setJdugeDomain(filter);
    60     crawler.setDepth(depth);
    61     crawler.startThread();
    62 }
    View Code
     1 #ifndef CRAWLER_H
     2 #define CRAWLER_H
     3 
     4 #include<set>
     5 #include<string>
     6 #include<queue>
     7 #include "winsock2.h"
     8 #include <iostream>
     9 #include <fstream>
    10 #include <stdio.h>
    11 #include<time.h>
    12 #include<winsock.h>
    13 #include<QThread>
    14 
    15 #pragma comment(lib, "ws2_32.lib")
    16 using namespace std;
    17 
    18 bool ParseURL(const string & url, string & host, string & resource);
    19 bool GetHttpResponse(const string & url, char * &response, int &bytesRead);
    20 QString s2q(const string &s);
    21 string q2s(const QString &s);
    22 
    23 #define DEFAULT_PAGE_BUF_SIZE 1000000
    24 
    25 class Crawler: public QThread
    26 {
    27     Q_OBJECT
    28 private:
    29     queue<string> urlWaiting;
    30     set<string> urlWaitset;
    31     set<string> urlProcessed;
    32     set<string> urlError;
    33     set<string> disallow;
    34     set<string>::iterator it;
    35     int numFindUrl;
    36     time_t starttime, finish;
    37     string filter;
    38     int depth;
    39 
    40 public:
    41     Crawler(){  filter = "";numFindUrl = 0;}
    42     ~Crawler(){}
    43     void begin();
    44     void setDepth(int depth);
    45     void processURL(string& strUrl);
    46     void addURL(string url);
    47     void log(string entry, int num);
    48     void HTMLParse(string & htmlResponse, const string & host);
    49     bool getRobotx(const string & url, char * &response, int &bytesRead);
    50     void setJdugeDomain(const string domain);
    51     long urlOtherWebsite(string url);
    52     void send(string s)
    53     {
    54         QString qs = s2q(s);
    55         emit sendMessage(qs);
    56     }
    57 signals:
    58     void sendMessage(const QString name);
    59 
    60 public slots:
    61         bool startThread();
    62 
    63 protected:
    64     void run();
    65 
    66 };
    67 #endif // CRAWLER_H
     1 #ifndef MAINWINDOW_H
     2 #define MAINWINDOW_H
     3 
     4 #include <QMainWindow>
     5 #include<QFile>
     6 #include<QMessageBox>
     7 #include<QTextStream>
     8 #include<QLineEdit>
     9 #include<QDebug>
    10 #include"crawler.h"
    11 #include"urlThread.h"
    12 
    13 namespace Ui {
    14 class MainWindow;
    15 }
    16 
    17 class MainWindow : public QMainWindow
    18 {
    19     Q_OBJECT
    20 
    21 public:
    22     explicit MainWindow(QWidget *parent = 0);
    23     ~MainWindow();
    24     void receiveMessage(const QString name);
    25 
    26 public slots:
    27         void beginGeturl();
    28         void open();
    29 
    30 private:
    31     Ui::MainWindow *ui;
    32     Crawler crawler;
    33     urlThread uth;
    34 };
    35 
    36 #endif // MAINWINDOW_H
    View Code
     1 #ifndef URLTHREAD_H
     2 #define URLTHREAD_H
     3 #include"crawler.h"
     4 
     5 class urlThread: public QThread
     6 {
     7     Q_OBJECT
     8 public slots:
     9         void startThread();
    10         void open();
    11         void send(string s)
    12         {
    13             QString qs = s2q(s);
    14             emit sendMessage(qs);
    15         }
    16 signals:
    17         void sendMessage(const QString name);
    18 protected:
    19         void run();
    20 };
    21 
    22 #endif // URLTHREAD_H
    View Code
      1 <?xml version="1.0" encoding="UTF-8"?>
        <!-- Qt Designer form for the MainWindow class.
             The widget names site, depth, filter, start, display and textBrowser
             are the ones the C++ code reaches through ui->...; keep them in sync.
             All widgets use fixed geometry rectangles; no layout is defined. -->
      2 <ui version="4.0">
      3  <class>MainWindow</class>
      4  <widget class="QMainWindow" name="MainWindow">
      5   <property name="geometry">
      6    <rect>
      7     <x>0</x>
      8     <y>0</y>
      9     <width>706</width>
     10     <height>381</height>
     11    </rect>
     12   </property>
     13   <property name="windowTitle">
     14    <string>MainWindow</string>
     15   </property>
     16   <widget class="QWidget" name="centralWidget">
     17    <widget class="QLabel" name="label">
     18     <property name="geometry">
     19      <rect>
     20       <x>10</x>
     21       <y>20</y>
     22       <width>54</width>
     23       <height>21</height>
     24      </rect>
     25     </property>
     26     <property name="text">
     27      <string>初始网址:</string>
     28     </property>
     29    </widget>
     30    <widget class="QLineEdit" name="site">
     31     <property name="geometry">
     32      <rect>
     33       <x>70</x>
     34       <y>20</y>
     35       <width>151</width>
     36       <height>20</height>
     37      </rect>
     38     </property>
     39     <property name="text">
     40      <string>http://www.scut.edu.cn</string>
     41     </property>
     42    </widget>
     43    <widget class="QLabel" name="label_2">
     44     <property name="geometry">
     45      <rect>
     46       <x>230</x>
     47       <y>20</y>
     48       <width>54</width>
     49       <height>21</height>
     50      </rect>
     51     </property>
     52     <property name="text">
     53      <string>搜索深度:</string>
     54     </property>
     55    </widget>
     56    <widget class="QLineEdit" name="depth">
     57     <property name="geometry">
     58      <rect>
     59       <x>290</x>
     60       <y>20</y>
     61       <width>51</width>
     62       <height>20</height>
     63      </rect>
     64     </property>
     65     <property name="text">
     66      <string>2</string>
     67     </property>
     68    </widget>
     69    <widget class="QLabel" name="label_3">
     70     <property name="geometry">
     71      <rect>
     72       <x>360</x>
     73       <y>20</y>
     74       <width>71</width>
     75       <height>21</height>
     76      </rect>
     77     </property>
     78     <property name="text">
     79      <string>过滤字符串:</string>
     80     </property>
     81    </widget>
     82    <widget class="QLineEdit" name="filter">
     83     <property name="geometry">
     84      <rect>
     85       <x>430</x>
     86       <y>20</y>
     87       <width>71</width>
     88       <height>20</height>
     89      </rect>
     90     </property>
     91     <property name="text">
     92      <string>scut</string>
     93     </property>
     94    </widget>
     95    <widget class="QPushButton" name="start">
     96     <property name="geometry">
     97      <rect>
     98       <x>510</x>
     99       <y>20</y>
    100       <width>51</width>
    101       <height>23</height>
    102      </rect>
    103     </property>
    104     <property name="text">
    105      <string>开始</string>
    106     </property>
    107    </widget>
    108    <widget class="QPushButton" name="display">
    109     <property name="geometry">
    110      <rect>
    111       <x>570</x>
    112       <y>20</y>
    113       <width>61</width>
    114       <height>23</height>
    115      </rect>
    116     </property>
    117     <property name="text">
    118      <string>显示url</string>
    119     </property>
    120    </widget>
    121    <widget class="QTextBrowser" name="textBrowser">
    122     <property name="geometry">
    123      <rect>
    124       <x>10</x>
    125       <y>60</y>
    126       <width>671</width>
    127       <height>271</height>
    128      </rect>
    129     </property>
    130    </widget>
    131   </widget>
    132   <widget class="QMenuBar" name="menuBar">
    133    <property name="geometry">
    134     <rect>
    135      <x>0</x>
    136      <y>0</y>
    137      <width>706</width>
    138      <height>23</height>
    139     </rect>
    140    </property>
    141   </widget>
    142   <widget class="QToolBar" name="mainToolBar">
    143    <attribute name="toolBarArea">
    144     <enum>TopToolBarArea</enum>
    145    </attribute>
    146    <attribute name="toolBarBreak">
    147     <bool>false</bool>
    148    </attribute>
    149   </widget>
    150   <widget class="QStatusBar" name="statusBar"/>
    151  </widget>
    152  <layoutdefault spacing="6" margin="11"/>
    153  <resources/>
    154  <connections/>
    155 </ui>
    View Code

    ui文件要自己设计

    爬虫还有一些小问题:抓取到的 URL 内容并不完整,部分地址的处理也还有问题……

    貌似博客园没有上传附件的地方?还是我没找到?希望得到提示~

  • 相关阅读:
    [置顶] Ubuntu 12.04以上如何恢复GNOME传统界面
    Cstyle的UEFI导读之SEC第一篇 Reset Vector
    二叉排序树的实现
    【最长上升子序列】HDU 1087——Super Jumping! Jumping! Jumping!
    java中的代理
    eclipse连接远程hadoop集群开发时0700问题解决方案
    eclipse连接远程hadoop集群开发时权限不足问题解决方案
    垂死或涅槃重生 -- Delphi XE5 我们将宣布感情的回归
    百度地图 Android SDK
    MySQL第五个学习笔记 该数据表的操作
  • 原文地址:https://www.cnblogs.com/george-cw/p/3833641.html
Copyright © 2011-2022 走看看