zoukankan      html  css  js  c++  java
  • data filter 去掉HTML文件中的所有标记

    编写一个C++程序来读取文件,过滤掉所有的标记,将过滤掉标记后的内容输出到一个新文件中。

    1. 从文件中读取一个字符

    2. 确定字符是否是HTML标记的一部分

    3. 打印出所有不是HTML标记的字符

    /* --------------------------------------------
     * This program reads an HTML file and writes
     * the text without the tags to a new file.
     * --------------------------------------------*/
    
    #include <iostream> // Required for cin, cout, cerr
    #include <fstream>  // Required for ifstream, ofstream
    #include <string>   // Required for string
    #include <cstdlib>  // Required for exit
    
    using namespace std;
    
    /*
     * Entry point: strips HTML tags from a file.
     *
     * Prompts on standard input for an input file name and an output file
     * name, then copies the input to the output character by character,
     * dropping everything from a '<' up to and including the next '>'.
     *
     * Returns 0 on success, 1 if either file cannot be opened or the
     * output cannot be written.
     *
     * Limitation: the filter is a two-state machine that only looks at
     * '<' and '>'. Text that sits between tags (for example the body of a
     * <script> or <style> element) is still copied, and a '>' inside an
     * attribute value or comment ends the tag early.
     */
    int main()
    {
        // Separator line printed between the interactive steps.
        const std::string separator(
            "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");

        std::string infile;
        std::string outfile;

        // Prompt user for name of input file
        std::cout << "Enter the name of the input file : \n( *.*, such as : demo.html ) \n";
        std::cout << "Make sure the file is under current project file ! \n";
        std::cin >> infile;

        std::cout << separator;

        // Prompt user for name of output file
        std::cout << "Enter the name of the output file :  ";
        std::cin >> outfile;

        // Open the input file and stop early if it is not readable.
        std::ifstream html(infile.c_str());
        if (html.fail())
        {
            std::cout << separator;
            std::cerr << "Error opening input file" << std::endl;
            return 1;
        }

        // Open the output file; without this check a bad path would
        // silently produce nothing and still report success.
        std::ofstream htmltext(outfile.c_str());
        if (htmltext.fail())
        {
            std::cout << separator;
            std::cerr << "Error opening output file" << std::endl;
            return 1;
        }

        // true  -> currently in plain text, characters are copied
        // false -> currently inside a tag, characters are dropped
        bool text_state = true;
        char ch;

        // Testing the result of get() ends the loop on end-of-file AND on
        // any read error; looping on !eof() alone would spin forever if
        // the stream failed for a reason other than reaching the end.
        while (html.get(ch))
        {
            if (text_state)
            {
                if (ch == '<')
                {
                    // Beginning of a tag: switch to the tag state.
                    text_state = false;
                }
                else
                {
                    // Still text: write it to the output file.
                    htmltext << ch;
                }
            }
            else if (ch == '>')
            {
                // End of the tag: switch back to the text state.
                text_state = true;
            }
        }

        html.close();
        htmltext.close();

        // close() flushes the buffer, so a write failure shows up here.
        if (htmltext.fail())
        {
            std::cout << separator;
            std::cerr << "Error writing output file" << std::endl;
            return 1;
        }

        std::cout << separator;
        std::cout << "Success transformed ! \n";
        std::cout << "Look for " << outfile << " in current file.\n";
        std::cout << separator;

        return 0;
    }
    

    之后就可以拿个HTML文件试试了。不过这个程序只是简单地把所有标记过滤掉,还有待完善:如果标记之外还夹杂着很多无关内容(例如 <script>、<style> 元素中的代码),效果就不尽如人意。建议用典型的HTML文件测试,如:

    <html>
    
    <head>
    <title>我的第一个 HTML 页面</title>
    </head>
    
    <body>
    <p>body 元素的内容会显示在浏览器中。</p>
    <p>title 元素的内容会显示在浏览器的标题栏中。</p>
    </body>
    
    </html>
    


  • 相关阅读:
    描述软件质量属性需求和质量属性场景(以淘宝网为例)
    软件架构初识
    机器学习实践02
    机器学习实践01
    机器学习十讲02
    用户故事与敏捷方法阅读笔记05
    机器学习十讲01
    用户故事与敏捷方法阅读笔记04
    tensorflow学习05(Mnist数据集)
    spring boot发送HTTP请求
  • 原文地址:https://www.cnblogs.com/Genesis2018/p/8304749.html
Copyright © 2011-2022 走看看