// zoukankan      html  css  js  c++  java
// [Repost] HubbleDotNet PDF Word document Filter Library

    
    namespace Microshaoft
    {
        using System;
        using System.Text;
        using Microshaoft;
        /// <summary>
        /// Console demo: extracts the text of one hard-coded document and prints it.
        /// </summary>
        public class Class1
        {
            /// <summary>
            /// Runs the extraction, writes the result to the console, and then waits for
            /// a line of input so the window stays open.
            /// </summary>
            static void Main()
            {
                const string documentPath = @"D:\Download\移动开发者解决方案研究报告第一期.pdf";

                string extractedText = IFilterHelper.GetTextFromDocument(documentPath);
                Console.WriteLine(extractedText);

                // Keep the console open until the user presses Enter.
                Console.ReadLine();
            }
        }
    }
    namespace Microshaoft
    {
        ///
        /// Sample library for using IFilter to read text from any registered filter type.
        /// 
        ///  Helpful links:
        ///     http://msdn.microsoft.com/en-us/library/ms691105(VS.85).aspx
        ///     http://ifilter.codeplex.com/
        ///     http://www.pinvoke.net/default.aspx/query/LoadIFilter.html
        ///     
        ///  Code here is taken from a combination of the project located at http://ifilter.codeplex.com/
        ///  as well as definitions taken from p-invoke.net.  License is MS-PL so enjoy.
        /// 
        ///  Modified by eaglet on 2013-01-09: added a convert-to-file method.
        using System;
        using System.Diagnostics;
        using System.Text;
        using System.Runtime.InteropServices;
        using Microshaoft;
        public static class IFilterHelper
        {
            /// <summary>
            /// Uses the Windows IFilter interface to extract the plain text of a document.
            /// </summary>
            /// <param name="path">Location of the file to parse.</param>
            /// <returns>
            /// The text of every text chunk in the document, concatenated in the order the
            /// filter returns them, with a space or a line break inserted according to each
            /// chunk's break type. An empty string is returned when the filter loads but its
            /// Init call does not report S_OK.
            /// </returns>
            /// <exception cref="InvalidOperationException">
            /// No IFilter could be loaded for <paramref name="path"/>, or the filter reported
            /// FILTER_E_NO_TEXT while a text chunk was being read.
            /// </exception>
            /// <exception cref="COMException">
            /// The filter returned a code that this reader does not know how to handle.
            /// </exception>
            public static string GetTextFromDocument(string path)
            {
                // Initialize the return buffer to 64K.
                StringBuilder sb = new StringBuilder(64 * 1024);
                IFilterAPI.IFilter filter = null;
                try
                {
                    // Try to load the filter for the path given.  The second parameter is declared
                    // as an object marshaled as IUnknown, so null is passed; a boxed IntPtr would
                    // be handed to native code as a real (non-null) outer object.
                    int hresult = IFilterAPI.LoadIFilter(path, null, ref filter);
                    if (hresult != 0 || filter == null)
                    {
                        // If you get here there is no filter for the file type you asked for.
                        // Throw an exception for the caller.
                        throw new InvalidOperationException("Failed to find IFilter for file " + path);
                    }

                    // Init the filter provider.
                    IFilterAPI.IFILTER_FLAGS uflags;
                    IFilterAPI.IFilterReturnCodes rc = filter.Init
                                    (
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_CANON_PARAGRAPHS |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_CANON_HYPHENS |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_CANON_SPACES |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES |
                                        IFilterAPI.IFILTER_INIT.IFILTER_INIT_INDEXING_ONLY
                                        , 0
                                        , IntPtr.Zero
                                        , out uflags
                                    );
                    if (rc != IFilterAPI.IFilterReturnCodes.S_OK)
                    {
                        // Kept from the original behavior: a failed Init yields empty text
                        // rather than an exception.
                        return sb.ToString();
                    }

                    // Read the document one chunk at a time.  For chunks that carry text, the
                    // contents are pulled and appended to the return buffer.
                    while (true)
                    {
                        IFilterAPI.STAT_CHUNK statChunk;
                        rc = filter.GetChunk(out statChunk);

                        // Once all chunks have been read, we are done with the file.
                        if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_END_OF_CHUNKS)
                        {
                            break;
                        }
                        // An unavailable embedding or link only skips that chunk.
                        if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE ||
                            rc == IFilterAPI.IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
                        {
                            continue;
                        }
                        if (rc != IFilterAPI.IFilterReturnCodes.S_OK)
                        {
                            throw new COMException("IFilter COM error: " + rc.ToString());
                        }

                        // Ignore all non-text chunks.
                        if (statChunk.flags != IFilterAPI.CHUNKSTATE.CHUNK_TEXT)
                        {
                            continue;
                        }

                        // Check for white space items and add the appropriate breaks.
                        switch (statChunk.breakType)
                        {
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_NO_BREAK:
                                break;
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOW:
                                sb.Append(' ');
                                break;
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOC:
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOP:
                            case IFilterAPI.CHUNK_BREAKTYPE.CHUNK_EOS:
                                sb.AppendLine();
                                break;
                        }

                        // At this point we have a text chunk; pull all of its text.
                        AppendChunkText(filter, sb);
                    }
                }
                finally
                {
                    // Release the COM filter deterministically instead of waiting for the
                    // garbage collector to finalize the runtime callable wrapper.
                    if (filter != null)
                    {
                        Marshal.ReleaseComObject(filter);
                    }
                }
                return sb.ToString();
            }

            /// <summary>
            /// Reads all text of the filter's current chunk, cleans it up, and appends it to
            /// <paramref name="output"/>.
            /// </summary>
            private static void AppendChunkText(IFilterAPI.IFilter filter, StringBuilder output)
            {
                const int DefaultBufferSize = 4096;
                while (true)
                {
                    // Fresh temporary buffer for every read; cBuffer carries the capacity in
                    // and the number of characters actually written out.
                    int cBuffer = DefaultBufferSize;
                    StringBuilder sbBuffer = new StringBuilder(DefaultBufferSize);

                    // Read the next piece of data up to the size of our local buffer.
                    IFilterAPI.IFilterReturnCodes rc = filter.GetText(ref cBuffer, sbBuffer);
                    if (rc == IFilterAPI.IFilterReturnCodes.S_OK ||
                        rc == IFilterAPI.IFilterReturnCodes.FILTER_S_LAST_TEXT)
                    {
                        // If any data was returned, scrub it and then add it to the buffer.
                        CleanUpCharacters(cBuffer, sbBuffer);
                        output.Append(sbBuffer.ToString());

                        // If we got back some text but there is no more, stop reading.
                        if (rc == IFilterAPI.IFilterReturnCodes.FILTER_S_LAST_TEXT)
                        {
                            return;
                        }
                    }
                    else if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_NO_MORE_TEXT)
                    {
                        // Once all data is exhausted, we are done with this chunk.
                        return;
                    }
                    else if (rc == IFilterAPI.IFilterReturnCodes.FILTER_E_NO_TEXT)
                    {
                        // The caller only gets here for text chunks, so this is a bug.
                        Debug.Assert(false, "Should not get here");
                        throw new InvalidOperationException();
                    }
                    else
                    {
                        // Any other code would previously have looped forever; report it the
                        // same way unexpected GetChunk results are reported.
                        throw new COMException("IFilter COM error: " + rc.ToString());
                    }
                }
            }
            /// <summary>
            /// Truncates <paramref name="buf"/> to <paramref name="chBuf"/> characters and folds
            /// typographic variants of characters (special spaces, dashes, quotes, brackets,
            /// super/subscript and enclosed digits, fullwidth forms, ...) to simple ASCII
            /// equivalents, in place, to make later parsing easier.
            /// </summary>
            /// <param name="chBuf">
            /// Number of valid characters at the start of <paramref name="buf"/>. Values below
            /// zero or above the buffer length are clamped to the valid range.
            /// </param>
            /// <param name="buf">Buffer to clean up; it is modified in place.</param>
            /// <exception cref="ArgumentNullException"><paramref name="buf"/> is null.</exception>
            public static void CleanUpCharacters(int chBuf, StringBuilder buf)
            {
                if (buf == null)
                {
                    throw new ArgumentNullException("buf");
                }

                // Clamp the count so a buffer shorter than the reported count (or a negative
                // count) cannot push the truncation or the index loop out of range.
                if (chBuf < 0)
                {
                    chBuf = 0;
                }
                if (chBuf > buf.Length)
                {
                    chBuf = buf.Length;
                }

                // Truncate any extra chars that may have been written to the buffer.
                buf.Length = chBuf;

                for (int i = 0; i < chBuf; i++)
                {
                    char original = buf[i];
                    char folded = FoldCharacter(original);
                    if (folded != original)
                    {
                        buf[i] = folded;
                    }
                }
            }

            /// <summary>
            /// Maps one character to its simplified form; characters without a mapping are
            /// returned unchanged.
            /// </summary>
            private static char FoldCharacter(char ch)
            {
                int chi = ch;
                switch (chi)
                {
                    case 0:        // embedded null
                    case 0x2000:   // en quad
                    case 0x2001:   // em quad
                    case 0x2002:   // en space
                    case 0x2003:   // em space
                    case 0x2004:   // three-per-em space
                    case 0x2005:   // four-per-em space
                    case 0x2006:   // six-per-em space
                    case 0x2007:   // figure space
                    case 0x2008:   // punctuation space
                    case 0x2009:   // thin space
                    case 0x200A:   // hair space
                    case 0x200B:   // zero-width space
                    case 0x200C:   // zero-width non-joiner
                    case 0x200D:   // zero-width joiner
                    case 0x202F:   // no-break space
                    case 0x3000:   // ideographic space
                        return ' ';
                    case 0x00B6:   // pilcrow
                    case 0x2028:   // line separator
                    case 0x2029:   // paragraph separator
                        return '\n';
                    case 0x00AD:   // soft-hyphen
                    case 0x00B7:   // middle dot
                    case 0x2010:   // hyphen
                    case 0x2011:   // non-breaking hyphen
                    case 0x2012:   // figure dash
                    case 0x2013:   // en dash
                    case 0x2014:   // em dash
                    case 0x2015:   // quote dash
                    case 0x2027:   // hyphenation point
                    case 0x2043:   // hyphen bullet
                    case 0x208B:   // subscript minus
                    case 0xFE31:   // vertical em dash
                    case 0xFE32:   // vertical en dash
                    case 0xFE58:   // small em dash
                    case 0xFE63:   // small hyphen minus
                        return '-';
                    case 0x00B0:   // degree
                    case 0x2018:   // left single quote
                    case 0x2019:   // right single quote
                    case 0x201A:   // low right single quote
                    case 0x201B:   // high left single quote
                    case 0x2032:   // prime
                    case 0x2035:   // reversed prime
                    case 0x2039:   // left-pointing angle quotation mark
                    case 0x203A:   // right-pointing angle quotation mark
                        return '\'';
                    case 0x201C:   // left double quote
                    case 0x201D:   // right double quote
                    case 0x201E:   // low right double quote
                    case 0x201F:   // high left double quote
                    case 0x2033:   // double prime
                    case 0x2034:   // triple prime
                    case 0x2036:   // reversed double prime
                    case 0x2037:   // reversed triple prime
                    case 0x00AB:   // left-pointing double angle quotation mark
                    case 0x00BB:   // right-pointing double angle quotation mark
                    case 0x3003:   // ditto mark
                    case 0x301D:   // reversed double prime quotation mark
                    case 0x301E:   // double prime quotation mark
                    case 0x301F:   // low double prime quotation mark
                        return '\"';
                    case 0x00A7:   // section-sign
                    case 0x2020:   // dagger
                    case 0x2021:   // double-dagger
                    case 0x2022:   // bullet
                    case 0x2023:   // triangle bullet
                    case 0x203B:   // reference mark
                    case 0xFE55:   // small colon
                        return ':';
                    case 0x2024:   // one dot leader
                    case 0x2025:   // two dot leader
                    case 0x2026:   // ellipsis
                    case 0x3002:   // ideographic full stop
                    case 0xFE30:   // two dot vertical leader
                    case 0xFE52:   // small full stop
                        return '.';
                    case 0x3001:   // ideographic comma
                    case 0xFE50:   // small comma
                    case 0xFE51:   // small ideographic comma
                        return ',';
                    case 0xFE54:   // small semicolon
                        return ';';
                    case 0x00A6:   // broken-bar
                    case 0x2016:   // double vertical line
                        return '|';
                    case 0x2017:   // double low line
                    case 0x203E:   // overline
                    case 0x203F:   // undertie
                    case 0x2040:   // character tie
                    case 0xFE33:   // vertical low line
                    case 0xFE49:   // dashed overline
                    case 0xFE4A:   // centerline overline
                    case 0xFE4D:   // dashed low line
                    case 0xFE4E:   // centerline low line
                        return '_';
                    case 0x301C:   // wave dash
                    case 0x3030:   // wavy dash
                    case 0xFE34:   // vertical wavy low line
                    case 0xFE4B:   // wavy overline
                    case 0xFE4C:   // double wavy overline
                    case 0xFE4F:   // wavy low line
                        return '~';
                    case 0x2038:   // caret
                    case 0x2041:   // caret insertion point
                        // The source had an empty char literal here, which does not compile;
                        // the labels describe carets, so fold to the ASCII caret.
                        return '^';
                    case 0x2030:   // per-mille
                    case 0x2031:   // per-ten thousand
                    case 0xFE6A:   // small per-cent
                        return '%';
                    case 0xFE6B:   // small commercial at
                        return '@';
                    case 0x00A9:   // copyright
                        return 'c';
                    case 0x00B5:   // micro
                        return 'u';
                    case 0x00AE:   // registered
                        return 'r';
                    case 0x207A:   // superscript plus
                    case 0x208A:   // subscript plus
                    case 0xFE62:   // small plus
                        return '+';
                    case 0x2044:   // fraction slash
                        return '/';
                    case 0x2042:   // asterism
                    case 0xFE61:   // small asterisk
                        return '*';
                    case 0x208C:   // subscript equal
                    case 0xFE66:   // small equal
                        return '=';
                    case 0xFE68:   // small reverse solidus
                        return '\\';
                    case 0xFE5F:   // small number sign
                        return '#';
                    case 0xFE60:   // small ampersand
                        return '&';
                    case 0xFE69:   // small dollar sign
                        return '$';
                    // The vertical/small forms below live in the U+FExx blocks.  The source
                    // listed them as U+FFxx, which hijacked fullwidth letters and punctuation
                    // (e.g. fullwidth 'a', U+FF41, became '[').
                    case 0x2045:   // left square bracket with quill
                    case 0x3010:   // left black lenticular bracket
                    case 0x3016:   // left white lenticular bracket
                    case 0x301A:   // left white square bracket
                    case 0xFE3B:   // vertical left lenticular bracket
                    case 0xFE41:   // vertical left corner bracket
                    case 0xFE43:   // vertical white left corner bracket
                        return '[';
                    case 0x2046:   // right square bracket with quill
                    case 0x3011:   // right black lenticular bracket
                    case 0x3017:   // right white lenticular bracket
                    case 0x301B:   // right white square bracket
                    case 0xFE3C:   // vertical right lenticular bracket
                    case 0xFE42:   // vertical right corner bracket
                    case 0xFE44:   // vertical white right corner bracket
                        return ']';
                    case 0x208D:   // subscript left parenthesis
                    case 0x3014:   // left tortoise-shell bracket
                    case 0x3018:   // left white tortoise-shell bracket
                    case 0xFE35:   // vertical left parenthesis
                    case 0xFE39:   // vertical left tortoise-shell bracket
                    case 0xFE59:   // small left parenthesis
                    case 0xFE5D:   // small left tortoise-shell bracket
                        return '(';
                    case 0x208E:   // subscript right parenthesis
                    case 0x3015:   // right tortoise-shell bracket
                    case 0x3019:   // right white tortoise-shell bracket
                    case 0xFE36:   // vertical right parenthesis
                    case 0xFE3A:   // vertical right tortoise-shell bracket
                    case 0xFE5A:   // small right parenthesis
                    case 0xFE5E:   // small right tortoise-shell bracket
                        return ')';
                    case 0x3008:   // left angle bracket
                    case 0x300A:   // left double angle bracket
                    case 0xFE3D:   // vertical left double angle bracket
                    case 0xFE3F:   // vertical left angle bracket
                    case 0xFE64:   // small less-than
                        return '<';
                    case 0x3009:   // right angle bracket
                    case 0x300B:   // right double angle bracket
                    case 0xFE3E:   // vertical right double angle bracket
                    case 0xFE40:   // vertical right angle bracket
                    case 0xFE65:   // small greater-than
                        return '>';
                    case 0xFE37:   // vertical left curly bracket
                    case 0xFE5B:   // small left curly bracket
                        return '{';
                    case 0xFE38:   // vertical right curly bracket
                    case 0xFE5C:   // small right curly bracket
                        return '}';
                    case 0x00A1:   // inverted exclamation mark
                    case 0x00AC:   // not
                    case 0x203C:   // double exclamation mark
                    case 0x203D:   // interrobang
                    case 0xFE57:   // small exclamation mark
                        return '!';
                    case 0x00BF:   // inverted question mark
                    case 0xFE56:   // small question mark
                        return '?';
                    case 0x00B9:   // superscript one
                        return '1';
                    case 0x00B2:   // superscript two
                        return '2';
                    case 0x00B3:   // superscript three
                        return '3';
                    case 0x2070:   // superscript zero
                    case 0x2074:   // superscript four
                    case 0x2075:   // superscript five
                    case 0x2076:   // superscript six
                    case 0x2077:   // superscript seven
                    case 0x2078:   // superscript eight
                    case 0x2079:   // superscript nine
                    case 0x2080:   // subscript zero
                    case 0x2081:   // subscript one
                    case 0x2082:   // subscript two
                    case 0x2083:   // subscript three
                    case 0x2084:   // subscript four
                    case 0x2085:   // subscript five
                    case 0x2086:   // subscript six
                    case 0x2087:   // subscript seven
                    case 0x2088:   // subscript eight
                    case 0x2089:   // subscript nine
                    case 0x3021:   // Hangzhou numeral one
                    case 0x3022:   // Hangzhou numeral two
                    case 0x3023:   // Hangzhou numeral three
                    case 0x3024:   // Hangzhou numeral four
                    case 0x3025:   // Hangzhou numeral five
                    case 0x3026:   // Hangzhou numeral six
                    case 0x3027:   // Hangzhou numeral seven
                    case 0x3028:   // Hangzhou numeral eight
                    case 0x3029:   // Hangzhou numeral nine
                        // The low nibble is the digit value; offset from '0' so the result is
                        // the ASCII digit rather than a control character.
                        return (char)('0' + (chi & 0x000F));
                    // ONE is at ZERO location... careful
                    case 0x3220:   // parenthesized ideograph one
                    case 0x3221:   // parenthesized ideograph two
                    case 0x3222:   // parenthesized ideograph three
                    case 0x3223:   // parenthesized ideograph four
                    case 0x3224:   // parenthesized ideograph five
                    case 0x3225:   // parenthesized ideograph six
                    case 0x3226:   // parenthesized ideograph seven
                    case 0x3227:   // parenthesized ideograph eight
                    case 0x3228:   // parenthesized ideograph nine
                    case 0x3280:   // circled ideograph one
                    case 0x3281:   // circled ideograph two
                    case 0x3282:   // circled ideograph three
                    case 0x3283:   // circled ideograph four
                    case 0x3284:   // circled ideograph five
                    case 0x3285:   // circled ideograph six
                    case 0x3286:   // circled ideograph seven
                    case 0x3287:   // circled ideograph eight
                    case 0x3288:   // circled ideograph nine
                        return (char)('0' + (chi & 0x000F) + 1);
                    case 0x3007:   // ideographic number zero
                    case 0x24EA:   // circled number zero
                        return '0';
                }

                if (0xFF01 <= chi && chi <= 0xFF5E)   // fullwidth exclamation mark .. fullwidth tilde
                {
                    // The fullwidth forms line up with the ASCII range '!'..'~'.
                    return (char)(chi - 0xFF01 + '!');
                }
                if (0x2460 <= chi && chi <= 0x2468)   // circled one .. circled nine
                {
                    return (char)(chi - 0x2460 + '1');
                }
                if (0x2474 <= chi && chi <= 0x247C)   // parenthesized one .. nine
                {
                    return (char)(chi - 0x2474 + '1');
                }
                if (0x2488 <= chi && chi <= 0x2490)   // one full stop .. nine full stop
                {
                    return (char)(chi - 0x2488 + '1');
                }
                if (0x249C <= chi && chi <= 0x24B5)   // parenthesized small a .. z
                {
                    return (char)(chi - 0x249C + 'a');
                }
                if (0x24B6 <= chi && chi <= 0x24CF)   // circled capital A .. Z
                {
                    return (char)(chi - 0x24B6 + 'A');
                }
                if (0x24D0 <= chi && chi <= 0x24E9)   // circled small a .. z
                {
                    return (char)(chi - 0x24D0 + 'a');
                }
                if (0x2500 <= chi && chi <= 0x257F)   // box drawing
                {
                    return '|';
                }
                if (0x2580 <= chi && chi <= 0x259F)   // block elements
                {
                    return '#';
                }
                if (0x25A0 <= chi && chi <= 0x25FF)   // geometric shapes
                {
                    return '*';
                }
                if (0x2600 <= chi && chi <= 0x267F)   // dingbats
                {
                    return '.';
                }

                // No mapping: keep the character as it is.
                return ch;
            }
        }
    }
    namespace Microshaoft
    {
        ///
        /// Sample library for using IFilter to read text from any registered filter type.
        /// 
        ///  Helpful links:
        ///     http://msdn.microsoft.com/en-us/library/ms691105(VS.85).aspx
        ///     http://ifilter.codeplex.com/
        ///     http://www.pinvoke.net/default.aspx/query/LoadIFilter.html
        ///     
        ///  Code here is taken from a combination of the project located at http://ifilter.codeplex.com/
        ///  as well as definitions taken from p-invoke.net.  License is MS-PL so enjoy.
        /// 
        ///  Modified by eaglet on 2013-01-09: added a convert-to-file method.
        using System;
        using System.Text;
        using System.Runtime.InteropServices;
        public static class IFilterAPI
        {
            /// <summary>
            /// P/Invoke declaration for the LoadIFilter export of query.dll; string arguments
            /// are marshaled as Unicode.
            /// </summary>
            /// <param name="pwcsPath">Path of the file a filter is requested for.</param>
            /// <param name="pUnkOuter">
            /// Object handed to native code as an IUnknown pointer.
            /// NOTE(review): presumably the controlling outer object for aggregation, with
            /// null meaning "none" - confirm against the native documentation.
            /// </param>
            /// <param name="ppIUnk">Receives the loaded filter as an IFilter reference.</param>
            /// <returns>
            /// Integer result code. The caller in this file treats 0 as success and any other
            /// value as "no filter available for this file".
            /// </returns>
            [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
            public static extern int LoadIFilter
                                        (
                                            string pwcsPath
                                            , [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter
                                            , ref IFilter ppIUnk
                                        );
            [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
            [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
            public interface IFilter
            {
                /// <summary>
                /// The IFilter::Init method initializes a filtering session.
                /// </summary>
                [PreserveSig]
                IFilterReturnCodes Init
                                    (
                                            //[in] Flag settings from the IFILTER_INIT enumeration for
                                            // controlling text standardization, property output, embedding
                                            // scope, and IFilter access patterns. 
                                          IFILTER_INIT grfFlags,
                                            // [in] The size of the attributes array. When nonzero, cAttributes
                                            //  takes 
                                            // precedence over attributes specified in grfFlags. If no
                                            // attribute flags 
                                            // are specified and cAttributes is zero, the default is given by
                                            // the 
                                            // PSGUID_STORAGE storage property set, which contains the date and
                                            //  time 
                                            // of the last write to the file, size, and so on; and by the
                                            //  PID_STG_CONTENTS 
                                            // 'contents' property, which maps to the main contents of the
                                            // file. 
                                            // For more information about properties and property sets, see
                                            // Property Sets. 
                                          int cAttributes,
                                            //[in] Array of pointers to FULLPROPSPEC structures for the
                                            // requested properties. 
                                            // When cAttributes is nonzero, only the properties in aAttributes
                                            // are returned. 
                                          IntPtr aAttributes,
                                            // [out] Information about additional properties available to the
                                            //  caller; from the IFILTER_FLAGS enumeration. 
                                          out IFILTER_FLAGS pdwFlags
                                    );
                /// <summary>
                /// The IFilter::GetChunk method positions the filter at the beginning
                /// of the next chunk, 
                /// or at the first chunk if this is the first call to the GetChunk
                /// method, and returns a description of the current chunk. 
                /// </summary>
                [PreserveSig]
                IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);
                /// <summary>
                /// The IFilter::GetText method retrieves text (text-type properties)
                /// from the current chunk, 
                /// which must have a CHUNKSTATE enumeration value of CHUNK_TEXT.
                /// </summary>
                [PreserveSig]
                IFilterReturnCodes GetText
                                        (
                                            // [in/out] On entry, the size of awcBuffer array in wide/Unicode
                                            // characters. On exit, the number of Unicode characters written to
                                            // awcBuffer. 
                                            // Note that this value is not the number of bytes in the buffer. 
                                            ref int pcwcBuffer,
                                            // Text retrieved from the current chunk. Do not terminate the
                                            // buffer with a character.  
                                            [Out(), MarshalAs(UnmanagedType.LPWStr)] 
                                            StringBuilder awcBuffer
                                        );
                /// <summary>
                /// The IFilter::GetValue method retrieves a value (public
                /// value-type property) from a chunk, 
                /// which must have a CHUNKSTATE enumeration value of CHUNK_VALUE.
                /// </summary>
                [PreserveSig]
                IFilterReturnCodes GetValue
                                    (
                                        // Allocate the PROPVARIANT structure with CoTaskMemAlloc. Some
                                        // PROPVARIANT 
                                        // structures contain pointers, which can be freed by calling the
                                        // PropVariantClear function. 
                                        // It is up to the caller of the GetValue method to call the
                                        //   PropVariantClear method.            
                                        // ref IntPtr ppPropValue
                                        // [MarshalAs(UnmanagedType.Struct)]
                                        ref IntPtr PropVal
                                    );
                /// <summary>
                /// The IFilter::BindRegion method retrieves an interface representing
                /// the specified portion of the object. 
                /// Currently reserved for future use.
                /// </summary>
                [PreserveSig]
                IFilterReturnCodes BindRegion
                                        (
                                            ref FILTERREGION origPos
                                            , ref Guid riid
                                            , ref object ppunk
                                        );
            }
            public struct FILTERREGION
            {
                public int idChunk;
                public int cwcStart;
                public int cwcExtent;
            }
            public enum IFilterReturnCodes : uint
            {
                /// <summary>
                /// Success
                /// </summary>
                S_OK = 0,
                /// <summary>
                /// The function was denied access to the filter file. 
                /// </summary>
                E_ACCESSDENIED = 0x80070005,
                /// <summary>
                /// The function encountered an invalid handle,
                /// probably due to a low-memory situation. 
                /// </summary>
                E_HANDLE = 0x80070006,
                /// <summary>
                /// The function received an invalid parameter.
                /// </summary>
                E_INVALIDARG = 0x80070057,
                /// <summary>
                /// Out of memory
                /// </summary>
                E_OUTOFMEMORY = 0x8007000E,
                /// <summary>
                /// Not implemented
                /// </summary>
                E_NOTIMPL = 0x80004001,
                /// <summary>
                /// Unknown error
                /// </summary>
                E_FAIL = 0x80000008,
                /// <summary>
                /// File not filtered due to password protection
                /// </summary>
                FILTER_E_PASSWORD = 0x8004170B,
                /// <summary>
                /// The document format is not recognised by the filter
                /// </summary>
                FILTER_E_UNKNOWNFORMAT = 0x8004170C,
                /// <summary>
                /// No text in current chunk
                /// </summary>
                FILTER_E_NO_TEXT = 0x80041705,
                /// <summary>
                /// No values in current chunk
                /// </summary>
                FILTER_E_NO_VALUES = 0x80041706,
                /// <summary>
                /// No more chunks of text available in object
                /// </summary>
                FILTER_E_END_OF_CHUNKS = 0x80041700,
                /// <summary>
                /// No more text available in chunk
                /// </summary>
                FILTER_E_NO_MORE_TEXT = 0x80041701,
                /// <summary>
                /// No more property values available in chunk
                /// </summary>
                FILTER_E_NO_MORE_VALUES = 0x80041702,
                /// <summary>
                /// Unable to access object
                /// </summary>
                FILTER_E_ACCESS = 0x80041703,
                /// <summary>
                /// Moniker doesn't cover entire region
                /// </summary>
                FILTER_W_MONIKER_CLIPPED = 0x00041704,
                /// <summary>
                /// Unable to bind IFilter for embedded object
                /// </summary>
                FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
                /// <summary>
                /// Unable to bind IFilter for linked object
                /// </summary>
                FILTER_E_LINK_UNAVAILABLE = 0x80041708,
                /// <summary>
                ///  This is the last text in the current chunk
                /// </summary>
                FILTER_S_LAST_TEXT = 0x00041709,
                /// <summary>
                /// This is the last value in the current chunk
                /// </summary>
                FILTER_S_LAST_VALUES = 0x0004170A
            }
            /// <summary>
            /// Flags controlling the operation of the FileFilter
            /// instance.
            /// </summary>
            [Flags]
            public enum IFILTER_INIT
            {
                IFILTER_INIT_CANON_PARAGRAPHS = 1,
                IFILTER_INIT_HARD_LINE_BREAKS = 2,
                IFILTER_INIT_CANON_HYPHENS = 4,
                IFILTER_INIT_CANON_SPACES = 8,
                IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
                IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
                IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
                IFILTER_INIT_INDEXING_ONLY = 64,
                IFILTER_INIT_SEARCH_LINKS = 128,
                IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
            }
            [Flags]
            public enum IFILTER_FLAGS
            {
                IFILTER_FLAGS_OLE_PROPERTIES = 1
            }
            public struct STAT_CHUNK
            {
                public int idChunk;
                [MarshalAs(UnmanagedType.U4)]
                public CHUNK_BREAKTYPE breakType;
                [MarshalAs(UnmanagedType.U4)]
                public CHUNKSTATE flags;
                public int locale;
                public FULLPROPSPEC attribute;
                public int idChunkSource;
                public int cwcStartSource;
                public int cwcLenSource;
            }
            public enum CHUNKSTATE
            {
                CHUNK_TEXT = 0x1,
                CHUNK_VALUE = 0x2,
                CHUNK_FILTER_OWNED_VALUE = 0x4
            }
            [StructLayout(LayoutKind.Explicit)]
            public struct PROPSPEC
            {
                [FieldOffset(0)]
                public int ulKind;     // 0 - string used; 1 - PROPID
                [FieldOffset(4)]
                public int propid;
                [FieldOffset(4)]
                public IntPtr lpwstr;
            }
            public struct FULLPROPSPEC
            {
                public Guid guidPropSet;
                public PROPSPEC psProperty;
            }
            public enum CHUNK_BREAKTYPE
            {
                CHUNK_NO_BREAK = 0,
                CHUNK_EOW = 1,
                CHUNK_EOS = 2,
                CHUNK_EOP = 3,
                CHUNK_EOC = 4
            }
        }
    }
    
    
  • 相关阅读:
    未能加载文件或程序集BUG系列
    寄语
    65. Valid Number
    56. Merge Intervals
    sublime text3 anaconda 插件报错
    42. Trapping Rain Water
    windows 下win+r无效
    93. Restore IP Addresses
    32. Longest Valid Parentheses
    48 Rotate Image
  • 原文地址:https://www.cnblogs.com/Microshaoft/p/2856132.html
Copyright © 2011-2022 走看看