zoukankan      html  css  js  c++  java
  • 转 HubbleDotNet PDF Word document Filter Library

    
    namespace Microshaoft
    {
        using System;
        using System.Text;
        using Microshaoft;
        public class Class1
        {
            // Demo entry point: extracts the text of one sample document through
            // IFilterHelper, echoes it to the console and waits for Enter.
            static void Main()
            {
                const string documentPath = @"D:\Download\移动开发者解决方案研究报告第一期.pdf";
                string extractedText = IFilterHelper.GetTextFromDocument(documentPath);
                Console.WriteLine(extractedText);
                // Keep the console window open until the user presses Enter.
                Console.ReadLine();
            }
        }
    }
    namespace Microshaoft
    {
        ///
        /// Sample library for using IFilter to read text from any registered filter type.
        /// 
        ///  Helpful links:
        ///     http://msdn.microsoft.com/en-us/library/ms691105(VS.85).aspx
        ///     http://ifilter.codeplex.com/
        ///     http://www.pinvoke.net/default.aspx/query/LoadIFilter.html
        ///     
        ///  Code here is taken from a combination of the project located at http://ifilter.codeplex.com/
        ///  as well as definitions taken from p-invoke.net.  License is MS-PL so enjoy.
        /// 
        ///  Modified by eaglet on 2013-01-09: added a convert-to-file method.
        using System;
        using System.Diagnostics;
        using System.Text;
        using System.Runtime.InteropServices;
        using Microshaoft;
        public static class IFilterHelper
        {
            /// <summary>
            /// Uses the Windows IFilter interface to extract the plain text of a file.
            /// </summary>
            /// <param name="path">Location of the file to parse.</param>
            /// <returns>
            /// The text chunks of the document concatenated into one string, with a space or a
            /// line break inserted according to each chunk's break type. An empty string is
            /// returned when the filter loads but its Init call does not return S_OK.
            /// </returns>
            /// <exception cref="InvalidOperationException">
            /// No IFilter could be loaded for <paramref name="path"/>, or the filter reported
            /// FILTER_E_NO_TEXT for a chunk that was flagged as text.
            /// </exception>
            /// <exception cref="COMException">
            /// The filter returned an unexpected code from GetChunk or GetText.
            /// </exception>
            public static string GetTextFromDocument(string path)
            {
                const int DefaultBufferSize = 4096;
                // Initialize the return buffer to 64K.
                StringBuilder sb = new StringBuilder(64 * 1024);
                IFilterAPI.IFilter filter = null;
                // Try to load the filter for the path given. The outer-unknown argument is an
                // object marshalled as IUnknown, so "no aggregation" must be expressed as null;
                // a boxed IntPtr would be passed to the native side as a non-null object.
                int hresult = IFilterAPI.LoadIFilter(path, null, ref filter);
                if (hresult != 0 || filter == null)
                {
                    // There is no filter for the file type asked for.
                    throw new InvalidOperationException("Failed to find IFilter for file " + path);
                }
                try
                {
                    IFilterAPI.IFILTER_FLAGS uflags;
                    // Init the filter provider.
                    IFilterAPI.IFilterReturnCodes rc = filter.Init
                                    (
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_CANON_PARAGRAPHS |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_CANON_HYPHENS |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_CANON_SPACES |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_INDEXING_ONLY
                                        , 0
                                        , IntPtr.Zero
                                        , out uflags
                                   );
                    if (rc != IFilterAPI.IFilterReturnCodes.S_OK)
                    {
                        // Preserved behaviour: a filter that fails to initialise yields no text.
                        return sb.ToString();
                    }
                    // Outer loop reads one chunk at a time. For chunks that carry text, the
                    // contents are pulled and appended to the return buffer.
                    while (true)
                    {
                        IFilterAPI.STAT_CHUNK statChunk;
                        rc = filter.GetChunk(out statChunk);
                        // Once all chunks have been read, we are done with the file.
                        if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_END_OF_CHUNKS)
                        {
                            break;
                        }
                        // Unavailable embeddings/links are skipped, not treated as fatal.
                        if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE ||
                            rc == IFilterAPI.IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
                        {
                            continue;
                        }
                        if (rc != IFilterAPI.IFilterReturnCodes.S_OK)
                        {
                            throw new COMException("IFilter COM error: " + rc.ToString());
                        }
                        // Ignore all non-text chunks.
                        if (statChunk.flags != IFilterAPI.CHUNKSTATE.CHUNK_TEXT)
                        {
                            continue;
                        }
                        // Check for white space items and add the appropriate breaks.
                        switch (statChunk.breakType)
                        {
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_NO_BREAK:
                                break;
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOW:
                                sb.Append(' ');
                                break;
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOC:
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOP:
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOS:
                                sb.AppendLine();
                                break;
                        }
                        // At this point we have a text chunk. Pull all of its text out,
                        // one local buffer at a time.
                        while (true)
                        {
                            int cBuffer = DefaultBufferSize;
                            StringBuilder sbBuffer = new StringBuilder(DefaultBufferSize);
                            // Read the next piece of data up to the size of our local buffer.
                            rc = filter.GetText(ref cBuffer, sbBuffer);
                            // Once all data is exhausted, this chunk is done.
                            if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_NO_MORE_TEXT)
                            {
                                break;
                            }
                            // A chunk flagged as text must have text. It is a bug if you land here.
                            if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_NO_TEXT)
                            {
                                Debug.Assert(false, "Should not get here");
                                throw new InvalidOperationException();
                            }
                            // Any other code would previously spin in this loop forever; report it
                            // the same way unexpected GetChunk codes are reported.
                            if (rc != IFilterAPI.IFilterReturnCodes.S_OK &&
                                rc != IFilterAPI.IFilterReturnCodes.FILTER_S_LAST_TEXT)
                            {
                                throw new COMException("IFilter COM error: " + rc.ToString());
                            }
                            // The reported count can disagree with the marshalled length
                            // (e.g. text containing an embedded null); never index past the end.
                            int charsRead = Math.Max(0, Math.Min(cBuffer, sbBuffer.Length));
                            // Scrub the returned data and add it to the buffer.
                            CleanUpCharacters(charsRead, sbBuffer);
                            sb.Append(sbBuffer.ToString());
                            // If we got back some text but there is no more, terminate the loop.
                            if (rc == IFilterAPI.IFilterReturnCodes.FILTER_S_LAST_TEXT)
                            {
                                break;
                            }
                        }
                    }
                }
                finally
                {
                    // Release the COM filter deterministically so the document is not kept
                    // open until the garbage collector finalizes the wrapper.
                    Marshal.ReleaseComObject(filter);
                }
                return sb.ToString();
            }
            /// <summary>
            /// Truncates <paramref name="buf"/> to its first <paramref name="chBuf"/> characters
            /// and folds typographic Unicode characters (special spaces, dashes, quotes, brackets,
            /// fullwidth and enclosed forms, ...) to their plain ASCII equivalents, in place.
            /// </summary>
            /// <param name="chBuf">
            /// Number of valid characters in <paramref name="buf"/>. Values outside
            /// 0..buf.Length are clamped to that range.
            /// </param>
            /// <param name="buf">Buffer to clean; it is modified in place.</param>
            /// <exception cref="ArgumentNullException"><paramref name="buf"/> is null.</exception>
            public static void CleanUpCharacters(int chBuf, StringBuilder buf)
            {
                if (buf == null)
                {
                    throw new ArgumentNullException("buf");
                }
                // The caller-supplied count may be larger than what the buffer really holds;
                // clamp it so neither the truncation nor the loop can run out of range.
                if (chBuf < 0)
                {
                    chBuf = 0;
                }
                if (chBuf > buf.Length)
                {
                    chBuf = buf.Length;
                }
                // Truncate any extra chars that may have been written to the buffer.
                buf.Length = chBuf;
                for (int i = 0; i < chBuf; i++)
                {
                    char current = buf[i];
                    char folded = FoldCharacter(current);
                    if (folded != current)
                    {
                        buf[i] = folded;
                    }
                }
            }
            // Maps one "cute" Unicode character to its simplified ASCII form to make parsing
            // easier; characters without a mapping are returned unchanged.
            private static char FoldCharacter(char ch)
            {
                int code = ch;
                switch (code)
                {
                    case 0:        // embedded null
                    case 0x2000:   // en quad
                    case 0x2001:   // em quad
                    case 0x2002:   // en space
                    case 0x2003:   // em space
                    case 0x2004:   // three-per-em space
                    case 0x2005:   // four-per-em space
                    case 0x2006:   // six-per-em space
                    case 0x2007:   // figure space
                    case 0x2008:   // punctuation space
                    case 0x2009:   // thin space
                    case 0x200A:   // hair space
                    case 0x200B:   // zero-width space
                    case 0x200C:   // zero-width non-joiner
                    case 0x200D:   // zero-width joiner
                    case 0x202F:   // no-break space
                    case 0x3000:   // ideographic space
                        return ' ';
                    case 0x00B6:   // pilcrow
                    case 0x2028:   // line separator
                    case 0x2029:   // paragraph separator
                        return '\n';
                    case 0x00AD:   // soft-hyphen
                    case 0x00B7:   // middle dot
                    case 0x2010:   // hyphen
                    case 0x2011:   // non-breaking hyphen
                    case 0x2012:   // figure dash
                    case 0x2013:   // en dash
                    case 0x2014:   // em dash
                    case 0x2015:   // quote dash
                    case 0x2027:   // hyphenation point
                    case 0x2043:   // hyphen bullet
                    case 0x208B:   // subscript minus
                    case 0xFE31:   // vertical em dash
                    case 0xFE32:   // vertical en dash
                    case 0xFE58:   // small em dash
                    case 0xFE63:   // small hyphen minus
                        return '-';
                    case 0x00B0:   // degree
                    case 0x2018:   // left single quote
                    case 0x2019:   // right single quote
                    case 0x201A:   // low right single quote
                    case 0x201B:   // high left single quote
                    case 0x2032:   // prime
                    case 0x2035:   // reversed prime
                    case 0x2039:   // left-pointing angle quotation mark
                    case 0x203A:   // right-pointing angle quotation mark
                        return '\'';
                    case 0x201C:   // left double quote
                    case 0x201D:   // right double quote
                    case 0x201E:   // low right double quote
                    case 0x201F:   // high left double quote
                    case 0x2033:   // double prime
                    case 0x2034:   // triple prime
                    case 0x2036:   // reversed double prime
                    case 0x2037:   // reversed triple prime
                    case 0x00AB:   // left-pointing double angle quotation mark
                    case 0x00BB:   // right-pointing double angle quotation mark
                    case 0x3003:   // ditto mark
                    case 0x301D:   // reversed double prime quotation mark
                    case 0x301E:   // double prime quotation mark
                    case 0x301F:   // low double prime quotation mark
                        return '\"';
                    case 0x00A7:   // section-sign
                    case 0x2020:   // dagger
                    case 0x2021:   // double-dagger
                    case 0x2022:   // bullet
                    case 0x2023:   // triangle bullet
                    case 0x203B:   // reference mark
                    case 0xFE55:   // small colon
                        return ':';
                    case 0x2024:   // one dot leader
                    case 0x2025:   // two dot leader
                    case 0x2026:   // ellipsis
                    case 0x3002:   // ideographic full stop
                    case 0xFE30:   // two dot vertical leader
                    case 0xFE52:   // small full stop
                        return '.';
                    case 0x3001:   // ideographic comma
                    case 0xFE50:   // small comma
                    case 0xFE51:   // small ideographic comma
                        return ',';
                    case 0xFE54:   // small semicolon
                        return ';';
                    case 0x00A6:   // broken-bar
                    case 0x2016:   // double vertical line
                        return '|';
                    case 0x2017:   // double low line
                    case 0x203E:   // overline
                    case 0x203F:   // undertie
                    case 0x2040:   // character tie
                    case 0xFE33:   // vertical low line
                    case 0xFE49:   // dashed overline
                    case 0xFE4A:   // centerline overline
                    case 0xFE4D:   // dashed low line
                    case 0xFE4E:   // centerline low line
                        return '_';
                    case 0x301C:   // wave dash
                    case 0x3030:   // wavy dash
                    case 0xFE34:   // vertical wavy low line
                    case 0xFE4B:   // wavy overline
                    case 0xFE4C:   // double wavy overline
                    case 0xFE4F:   // wavy low line
                        return '~';
                    case 0x2038:   // caret
                    case 0x2041:   // caret insertion point
                        return '^';
                    case 0x2030:   // per-mille
                    case 0x2031:   // per-ten thousand
                    case 0xFE6A:   // small per-cent
                        return '%';
                    case 0xFE6B:   // small commercial at
                        return '@';
                    case 0x00A9:   // copyright
                        return 'c';
                    case 0x00B5:   // micro
                        return 'u';
                    case 0x00AE:   // registered
                        return 'r';
                    case 0x207A:   // superscript plus
                    case 0x208A:   // subscript plus
                    case 0xFE62:   // small plus
                        return '+';
                    case 0x2044:   // fraction slash
                        return '/';
                    case 0x2042:   // asterism
                    case 0xFE61:   // small asterisk
                        return '*';
                    case 0x208C:   // subscript equal
                    case 0xFE66:   // small equal
                        return '=';
                    case 0xFE68:   // small reverse solidus
                        return '\\';
                    case 0xFE5F:   // small number sign
                        return '#';
                    case 0xFE60:   // small ampersand
                        return '&';
                    case 0xFE69:   // small dollar sign
                        return '$';
                    // The vertical presentation forms and small form variants below live in the
                    // U+FExx blocks; the U+FFxx values are fullwidth ASCII and are handled by
                    // the range mapping after the switch.
                    case 0x2045:   // left square bracket with quill
                    case 0x3010:   // left black lenticular bracket
                    case 0x3016:   // left white lenticular bracket
                    case 0x301A:   // left white square bracket
                    case 0xFE3B:   // vertical left lenticular bracket
                    case 0xFE41:   // vertical left corner bracket
                    case 0xFE43:   // vertical white left corner bracket
                        return '[';
                    case 0x2046:   // right square bracket with quill
                    case 0x3011:   // right black lenticular bracket
                    case 0x3017:   // right white lenticular bracket
                    case 0x301B:   // right white square bracket
                    case 0xFE3C:   // vertical right lenticular bracket
                    case 0xFE42:   // vertical right corner bracket
                    case 0xFE44:   // vertical white right corner bracket
                        return ']';
                    case 0x208D:   // subscript left parenthesis
                    case 0x3014:   // left tortoise-shell bracket
                    case 0x3018:   // left white tortoise-shell bracket
                    case 0xFE35:   // vertical left parenthesis
                    case 0xFE39:   // vertical left tortoise-shell bracket
                    case 0xFE59:   // small left parenthesis
                    case 0xFE5D:   // small left tortoise-shell bracket
                        return '(';
                    case 0x208E:   // subscript right parenthesis
                    case 0x3015:   // right tortoise-shell bracket
                    case 0x3019:   // right white tortoise-shell bracket
                    case 0xFE36:   // vertical right parenthesis
                    case 0xFE3A:   // vertical right tortoise-shell bracket
                    case 0xFE5A:   // small right parenthesis
                    case 0xFE5E:   // small right tortoise-shell bracket
                        return ')';
                    case 0x3008:   // left angle bracket
                    case 0x300A:   // left double angle bracket
                    case 0xFE3D:   // vertical left double angle bracket
                    case 0xFE3F:   // vertical left angle bracket
                    case 0xFE64:   // small less-than
                        return '<';
                    case 0x3009:   // right angle bracket
                    case 0x300B:   // right double angle bracket
                    case 0xFE3E:   // vertical right double angle bracket
                    case 0xFE40:   // vertical right angle bracket
                    case 0xFE65:   // small greater-than
                        return '>';
                    case 0xFE37:   // vertical left curly bracket
                    case 0xFE5B:   // small left curly bracket
                        return '{';
                    case 0xFE38:   // vertical right curly bracket
                    case 0xFE5C:   // small right curly bracket
                        return '}';
                    case 0x00A1:   // inverted exclamation mark
                    case 0x00AC:   // not
                    case 0x203C:   // double exclamation mark
                    case 0x203D:   // interrobang
                    case 0xFE57:   // small exclamation mark
                        return '!';
                    case 0x00BF:   // inverted question mark
                    case 0xFE56:   // small question mark
                        return '?';
                    case 0x00B9:   // superscript one
                        return '1';
                    case 0x00B2:   // superscript two
                        return '2';
                    case 0x00B3:   // superscript three
                        return '3';
                    case 0x2070:   // superscript zero
                    case 0x2074:   // superscript four
                    case 0x2075:   // superscript five
                    case 0x2076:   // superscript six
                    case 0x2077:   // superscript seven
                    case 0x2078:   // superscript eight
                    case 0x2079:   // superscript nine
                    case 0x2080:   // subscript zero
                    case 0x2081:   // subscript one
                    case 0x2082:   // subscript two
                    case 0x2083:   // subscript three
                    case 0x2084:   // subscript four
                    case 0x2085:   // subscript five
                    case 0x2086:   // subscript six
                    case 0x2087:   // subscript seven
                    case 0x2088:   // subscript eight
                    case 0x2089:   // subscript nine
                    case 0x3021:   // Hangzhou numeral one
                    case 0x3022:   // Hangzhou numeral two
                    case 0x3023:   // Hangzhou numeral three
                    case 0x3024:   // Hangzhou numeral four
                    case 0x3025:   // Hangzhou numeral five
                    case 0x3026:   // Hangzhou numeral six
                    case 0x3027:   // Hangzhou numeral seven
                    case 0x3028:   // Hangzhou numeral eight
                    case 0x3029:   // Hangzhou numeral nine
                        // The low nibble is the digit's value; offset it from '0' so the
                        // result is an ASCII digit rather than a control character.
                        return (char)('0' + (code & 0x000F));
                    // ONE is at ZERO location... careful
                    case 0x3220:   // parenthesized ideograph one
                    case 0x3221:   // parenthesized ideograph two
                    case 0x3222:   // parenthesized ideograph three
                    case 0x3223:   // parenthesized ideograph four
                    case 0x3224:   // parenthesized ideograph five
                    case 0x3225:   // parenthesized ideograph six
                    case 0x3226:   // parenthesized ideograph seven
                    case 0x3227:   // parenthesized ideograph eight
                    case 0x3228:   // parenthesized ideograph nine
                    case 0x3280:   // circled ideograph one
                    case 0x3281:   // circled ideograph two
                    case 0x3282:   // circled ideograph three
                    case 0x3283:   // circled ideograph four
                    case 0x3284:   // circled ideograph five
                    case 0x3285:   // circled ideograph six
                    case 0x3286:   // circled ideograph seven
                    case 0x3287:   // circled ideograph eight
                    case 0x3288:   // circled ideograph nine
                        return (char)('0' + (code & 0x000F) + 1);
                    case 0x3007:   // ideographic number zero
                    case 0x24EA:   // circled number zero
                        return '0';
                }
                if (0xFF01 <= code && code <= 0xFF5E)        // fullwidth '!' .. fullwidth '~'
                {
                    // The fullwidth forms line up one-to-one with ASCII '!'..'~'.
                    return (char)(code - 0xFF01 + '!');
                }
                if (0x2460 <= code && code <= 0x2468)        // circled one .. nine
                {
                    return (char)(code - 0x2460 + '1');
                }
                if (0x2474 <= code && code <= 0x247C)        // parenthesized one .. nine
                {
                    return (char)(code - 0x2474 + '1');
                }
                if (0x2488 <= code && code <= 0x2490)        // one full stop .. nine full stop
                {
                    return (char)(code - 0x2488 + '1');
                }
                if (0x249C <= code && code <= 0x24B5)        // parenthesized small a .. z
                {
                    return (char)(code - 0x249C + 'a');
                }
                if (0x24B6 <= code && code <= 0x24CF)        // circled capital A .. Z
                {
                    return (char)(code - 0x24B6 + 'A');
                }
                if (0x24D0 <= code && code <= 0x24E9)        // circled small a .. z
                {
                    return (char)(code - 0x24D0 + 'a');
                }
                if (0x2500 <= code && code <= 0x257F)        // box drawing
                {
                    return '|';
                }
                if (0x2580 <= code && code <= 0x259F)        // block elements
                {
                    return '#';
                }
                if (0x25A0 <= code && code <= 0x25FF)        // geometric shapes
                {
                    return '*';
                }
                if (0x2600 <= code && code <= 0x267F)        // dingbats
                {
                    return '.';
                }
                return ch;
            }
        }
    }
    namespace Microshaoft
    {
        ///
        /// Sample library for using IFilter to read text from any registered filter type.
        /// 
        ///  Helpful links:
        ///     http://msdn.microsoft.com/en-us/library/ms691105(VS.85).aspx
        ///     http://ifilter.codeplex.com/
        ///     http://www.pinvoke.net/default.aspx/query/LoadIFilter.html
        ///     
        ///  Code here is taken from a combination of the project located at http://ifilter.codeplex.com/
        ///  as well as definitions taken from p-invoke.net.  License is MS-PL so enjoy.
        /// 
        ///  Modified by eaglet on 2013-01-09: added a convert-to-file method.
        using System;
        using System.Text;
        using System.Runtime.InteropServices;
        public static class IFilterAPI
        {
            /// <summary>
            /// P/Invoke declaration for the LoadIFilter export of query.dll.
            /// String arguments are marshalled as Unicode.
            /// </summary>
            /// <param name="pwcsPath">Path of the file for which a filter is requested.</param>
            /// <param name="pUnkOuter">
            /// Outer object, marshalled to the native side as an IUnknown pointer.
            /// NOTE(review): presumably null is the intended value when no aggregation is
            /// wanted - confirm against the LoadIFilter documentation.
            /// </param>
            /// <param name="ppIUnk">Passed by reference; the helper in this file uses it as the loaded filter after the call.</param>
            /// <returns>An integer result code; the helper in this file treats 0 as success.</returns>
            [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
            public static extern int LoadIFilter
                                        (
                                            string pwcsPath
                                            , [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter
                                            , ref IFilter ppIUnk
                                        );
            /// <summary>
            /// Managed declaration of the COM <c>IFilter</c> interface
            /// (IID 89BCB740-6119-101A-BCB7-00DD010655AF, IUnknown-based).
            /// Every method is marked <c>[PreserveSig]</c>, so the status code is
            /// returned to the caller as an <see cref="IFilterReturnCodes"/> value.
            /// The declaration order of the methods is significant for COM interop
            /// and must not be changed.
            /// </summary>
            [ComImport]
            [Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
            [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
            public interface IFilter
            {
                /// <summary>
                /// The IFilter::Init method initializes a filtering session.
                /// </summary>
                /// <param name="grfFlags">
                /// [in] Flag settings from the IFILTER_INIT enumeration for controlling
                /// text standardization, property output, embedding scope, and IFilter
                /// access patterns.
                /// </param>
                /// <param name="cAttributes">
                /// [in] The size of the attributes array. When nonzero, cAttributes takes
                /// precedence over attributes specified in grfFlags. If no attribute flags
                /// are specified and cAttributes is zero, the default is given by the
                /// PSGUID_STORAGE storage property set (which contains the date and time
                /// of the last write to the file, size, and so on) and by the
                /// PID_STG_CONTENTS 'contents' property, which maps to the main contents
                /// of the file.
                /// </param>
                /// <param name="aAttributes">
                /// [in] Array of pointers to FULLPROPSPEC structures for the requested
                /// properties. When cAttributes is nonzero, only the properties in
                /// aAttributes are returned.
                /// </param>
                /// <param name="pdwFlags">
                /// [out] Information about additional properties available to the caller;
                /// from the IFILTER_FLAGS enumeration.
                /// </param>
                /// <returns>Status code of the call.</returns>
                [PreserveSig]
                IFilterReturnCodes Init(
                    IFILTER_INIT grfFlags,
                    int cAttributes,
                    IntPtr aAttributes,
                    out IFILTER_FLAGS pdwFlags);

                /// <summary>
                /// The IFilter::GetChunk method positions the filter at the beginning of
                /// the next chunk (or at the first chunk if this is the first call to
                /// GetChunk) and returns a description of the current chunk.
                /// </summary>
                /// <param name="pStat">Receives the description of the current chunk.</param>
                /// <returns>Status code of the call.</returns>
                [PreserveSig]
                IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);

                /// <summary>
                /// The IFilter::GetText method retrieves text (text-type properties) from
                /// the current chunk, which must have a CHUNKSTATE enumeration value of
                /// CHUNK_TEXT.
                /// </summary>
                /// <param name="pcwcBuffer">
                /// [in/out] On entry, the size of the awcBuffer array in wide/Unicode
                /// characters. On exit, the number of Unicode characters written to
                /// awcBuffer. Note that this value is not the number of bytes in the buffer.
                /// </param>
                /// <param name="awcBuffer">
                /// Text retrieved from the current chunk. Do not terminate the buffer
                /// with a character.
                /// </param>
                /// <returns>Status code of the call.</returns>
                [PreserveSig]
                IFilterReturnCodes GetText(
                    ref int pcwcBuffer,
                    [Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);

                /// <summary>
                /// The IFilter::GetValue method retrieves a value (public value-type
                /// property) from a chunk, which must have a CHUNKSTATE enumeration value
                /// of CHUNK_VALUE.
                /// </summary>
                /// <param name="PropVal">
                /// Pointer slot for the PROPVARIANT structure. The PROPVARIANT structure is
                /// allocated with CoTaskMemAlloc. Some PROPVARIANT structures contain
                /// pointers, which can be freed by calling the PropVariantClear function;
                /// it is up to the caller of GetValue to call PropVariantClear.
                /// </param>
                /// <returns>Status code of the call.</returns>
                [PreserveSig]
                IFilterReturnCodes GetValue(ref IntPtr PropVal);

                /// <summary>
                /// The IFilter::BindRegion method retrieves an interface representing the
                /// specified portion of the object. Currently reserved for future use.
                /// </summary>
                /// <param name="origPos">Region of the object to bind to.</param>
                /// <param name="riid">Identifier of the requested interface.</param>
                /// <param name="ppunk">Slot for the returned interface object.</param>
                /// <returns>Status code of the call.</returns>
                [PreserveSig]
                IFilterReturnCodes BindRegion(
                    ref FILTERREGION origPos,
                    ref Guid riid,
                    ref object ppunk);
            }
            /// <summary>
            /// Region descriptor passed to <c>IFilter.BindRegion</c>: three consecutive
            /// 32-bit integers. The layout is sequential (the C# default for structs,
            /// stated explicitly here because the type crosses the interop boundary).
            /// </summary>
            [StructLayout(LayoutKind.Sequential)]
            public struct FILTERREGION
            {
                /// <summary>Chunk identifier.</summary>
                public int idChunk;

                /// <summary>Start position (name suggests a count of wide characters; confirm against the native docs).</summary>
                public int cwcStart;

                /// <summary>Extent (name suggests a count of wide characters; confirm against the native docs).</summary>
                public int cwcExtent;
            }
            /// <summary>
            /// Status codes returned by the <c>IFilter</c> methods. The underlying type
            /// is <c>uint</c> so that failure codes (high bit set) can be written as
            /// plain hexadecimal literals.
            /// </summary>
            public enum IFilterReturnCodes : uint
            {
                /// <summary>
                /// Success
                /// </summary>
                S_OK = 0,
                /// <summary>
                /// The function was denied access to the filter file. 
                /// </summary>
                E_ACCESSDENIED = 0x80070005,
                /// <summary>
                /// The function encountered an invalid handle,
                /// probably due to a low-memory situation. 
                /// </summary>
                E_HANDLE = 0x80070006,
                /// <summary>
                /// The function received an invalid parameter.
                /// </summary>
                E_INVALIDARG = 0x80070057,
                /// <summary>
                /// Out of memory
                /// </summary>
                E_OUTOFMEMORY = 0x8007000E,
                /// <summary>
                /// Not implemented
                /// </summary>
                E_NOTIMPL = 0x80004001,
                /// <summary>
                /// Unspecified failure
                /// </summary>
                // The Win32 value of E_FAIL is 0x80004005. The previous value,
                // 0x80000008, is the legacy 16-bit definition and is never returned
                // by Win32 COM components, so comparisons against it could not match.
                E_FAIL = 0x80004005,
                /// <summary>
                /// File not filtered due to password protection
                /// </summary>
                FILTER_E_PASSWORD = 0x8004170B,
                /// <summary>
                /// The document format is not recognised by the filter
                /// </summary>
                FILTER_E_UNKNOWNFORMAT = 0x8004170C,
                /// <summary>
                /// No text in current chunk
                /// </summary>
                FILTER_E_NO_TEXT = 0x80041705,
                /// <summary>
                /// No values in current chunk
                /// </summary>
                FILTER_E_NO_VALUES = 0x80041706,
                /// <summary>
                /// No more chunks of text available in object
                /// </summary>
                FILTER_E_END_OF_CHUNKS = 0x80041700,
                /// <summary>
                /// No more text available in chunk
                /// </summary>
                FILTER_E_NO_MORE_TEXT = 0x80041701,
                /// <summary>
                /// No more property values available in chunk
                /// </summary>
                FILTER_E_NO_MORE_VALUES = 0x80041702,
                /// <summary>
                /// Unable to access object
                /// </summary>
                FILTER_E_ACCESS = 0x80041703,
                /// <summary>
                /// Moniker doesn't cover entire region
                /// </summary>
                FILTER_W_MONIKER_CLIPPED = 0x00041704,
                /// <summary>
                /// Unable to bind IFilter for embedded object
                /// </summary>
                FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
                /// <summary>
                /// Unable to bind IFilter for linked object
                /// </summary>
                FILTER_E_LINK_UNAVAILABLE = 0x80041708,
                /// <summary>
                ///  This is the last text in the current chunk
                /// </summary>
                FILTER_S_LAST_TEXT = 0x00041709,
                /// <summary>
                /// This is the last value in the current chunk
                /// </summary>
                FILTER_S_LAST_VALUES = 0x0004170A
            }
            /// <summary>
            /// Bit flags passed as the <c>grfFlags</c> argument of <c>IFilter.Init</c>
            /// to control the operation of the filter instance. Values may be
            /// combined with bitwise OR.
            /// </summary>
            [Flags]
            public enum IFILTER_INIT
            {
                IFILTER_INIT_CANON_PARAGRAPHS = 0x001,
                IFILTER_INIT_HARD_LINE_BREAKS = 0x002,
                IFILTER_INIT_CANON_HYPHENS = 0x004,
                IFILTER_INIT_CANON_SPACES = 0x008,
                IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 0x010,
                IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 0x020,
                IFILTER_INIT_INDEXING_ONLY = 0x040,
                IFILTER_INIT_SEARCH_LINKS = 0x080,
                IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 0x100,
                IFILTER_INIT_FILTER_OWNED_VALUE_OK = 0x200
            }
            /// <summary>
            /// Flags reported through the <c>pdwFlags</c> output of <c>IFilter.Init</c>,
            /// describing additional properties available to the caller.
            /// </summary>
            [Flags]
            public enum IFILTER_FLAGS
            {
                IFILTER_FLAGS_OLE_PROPERTIES = 0x1
            }
            /// <summary>
            /// Chunk description produced by <c>IFilter.GetChunk</c>. Field names and
            /// field order follow the native STAT_CHUNK structure; the order must be
            /// preserved because the struct is marshaled sequentially.
            /// </summary>
            [StructLayout(LayoutKind.Sequential)]
            public struct STAT_CHUNK
            {
                /// <summary>Chunk identifier.</summary>
                public int idChunk;

                /// <summary>Break type of the chunk; marshaled as a 4-byte unsigned integer.</summary>
                [MarshalAs(UnmanagedType.U4)]
                public CHUNK_BREAKTYPE breakType;

                /// <summary>Chunk state flags; marshaled as a 4-byte unsigned integer.</summary>
                [MarshalAs(UnmanagedType.U4)]
                public CHUNKSTATE flags;

                /// <summary>Locale value reported for the chunk.</summary>
                public int locale;

                /// <summary>Property specification associated with the chunk.</summary>
                public FULLPROPSPEC attribute;

                /// <summary>Source chunk identifier.</summary>
                public int idChunkSource;

                /// <summary>Start of the source text (see the native documentation for exact semantics).</summary>
                public int cwcStartSource;

                /// <summary>Length of the source text (see the native documentation for exact semantics).</summary>
                public int cwcLenSource;
            }
            /// <summary>
            /// State values carried in <c>STAT_CHUNK.flags</c>. <c>IFilter.GetText</c>
            /// requires a chunk with <c>CHUNK_TEXT</c>; <c>IFilter.GetValue</c> requires
            /// a chunk with <c>CHUNK_VALUE</c>.
            /// </summary>
            public enum CHUNKSTATE
            {
                CHUNK_TEXT = 1,
                CHUNK_VALUE = 2,
                CHUNK_FILTER_OWNED_VALUE = 4
            }
            /// <summary>
            /// Managed mirror of the native PROPSPEC structure: a 32-bit kind
            /// discriminator followed by a union of a 32-bit property id and a
            /// pointer to a wide-character property name.
            /// </summary>
            /// <remarks>
            /// Because the native union contains a pointer, it is pointer-aligned: it
            /// starts at offset 4 in a 32-bit process but at offset 8 in a 64-bit
            /// process. A hard-coded <c>FieldOffset(4)</c> is therefore only correct
            /// for 32-bit. Sequential layout with a single pointer-sized field lets
            /// the runtime choose the correct offset and size on both platforms.
            /// </remarks>
            [StructLayout(LayoutKind.Sequential)]
            public struct PROPSPEC
            {
                /// <summary>Discriminator for the union.</summary>
                public int ulKind;     // 0 - string used; 1 - PROPID

                /// <summary>
                /// Storage for the union. Holds the string pointer when
                /// <see cref="ulKind"/> is 0; its low 32 bits hold the property id
                /// when <see cref="ulKind"/> is 1.
                /// </summary>
                public IntPtr lpwstr;

                /// <summary>
                /// Property id view of the union: the low 32 bits of
                /// <see cref="lpwstr"/> (little-endian layout, as on Windows).
                /// </summary>
                public int propid
                {
                    // ToInt64 + unchecked cast avoids the OverflowException that
                    // IntPtr.ToInt32 throws when the upper 32 bits are not a sign extension.
                    get { return unchecked((int)lpwstr.ToInt64()); }
                    set { lpwstr = new IntPtr(value); }
                }
            }
            /// <summary>
            /// Pairs a property-set GUID with a <see cref="PROPSPEC"/> to fully
            /// identify a property. Marshaled sequentially (the C# default for
            /// structs, stated explicitly here).
            /// </summary>
            [StructLayout(LayoutKind.Sequential)]
            public struct FULLPROPSPEC
            {
                /// <summary>GUID of the property set.</summary>
                public Guid guidPropSet;

                /// <summary>Property specification within that set.</summary>
                public PROPSPEC psProperty;
            }
            /// <summary>
            /// Break-type values carried in <c>STAT_CHUNK.breakType</c>. The member
            /// names presumably abbreviate end-of-word (EOW), end-of-sentence (EOS),
            /// end-of-paragraph (EOP) and end-of-chapter (EOC); confirm against the
            /// native CHUNK_BREAKTYPE documentation.
            /// </summary>
            public enum CHUNK_BREAKTYPE
            {
                CHUNK_NO_BREAK = 0x0,
                CHUNK_EOW = 0x1,
                CHUNK_EOS = 0x2,
                CHUNK_EOP = 0x3,
                CHUNK_EOC = 0x4
            }
        }
    }
    
    
  • 相关阅读:
    MySQL "show users"
    MySQL
    A MySQL 'create table' syntax example
    MySQL backup
    MySQL show status
    Tomcat, pathinfo, and servlets
    Servlet forward example
    Servlet redirect example
    Java servlet example
    How to forward from one JSP to another JSP
  • 原文地址:https://www.cnblogs.com/Microshaoft/p/2856132.html
Copyright © 2011-2022 走看看