zoukankan      html  css  js  c++  java
  • Java 提取字符串中所有的 URL 链接,并为其加上 <a> 标签

    工具类

    Patterns.java

      1 package com.util;
      2 
      3 import java.util.regex.Matcher;
      4 import java.util.regex.Pattern;
      5 
      /**
       * Commonly used regular expression patterns (URLs, domain names, IP addresses,
       * e-mail addresses and phone-number-like strings).
       *
       * <p>All regular expressions are written as Java string literals, so every regex
       * backslash is doubled and every non-ASCII character is written as a Java Unicode
       * escape. Dropping either backslash changes what the expressions match or stops the
       * class from compiling.
       */
      public class Patterns {
          /**
           * Alternatives for the internationalized, punycode and {@code xxx} top-level
           * domains of the 2011 list. Shared by {@link #TOP_LEVEL_DOMAIN_STR} and
           * {@link #TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL}, which only differ in the kind of
           * group they wrap around it.
           */
          private static final String LEGACY_IDN_TLD_ALTERNATIVES =
                  "\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae"
                          + "|\u0438\u0441\u043f\u044b\u0442\u0430\u043d\u0438\u0435"
                          + "|\u0440\u0444"
                          + "|\u0441\u0440\u0431"
                          + "|\u05d8\u05e2\u05e1\u05d8"
                          + "|\u0622\u0632\u0645\u0627\u06cc\u0634\u06cc"
                          + "|\u0625\u062e\u062a\u0628\u0627\u0631"
                          + "|\u0627\u0644\u0627\u0631\u062f\u0646"
                          + "|\u0627\u0644\u062c\u0632\u0627\u0626\u0631"
                          + "|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629"
                          + "|\u0627\u0644\u0645\u063a\u0631\u0628"
                          + "|\u0627\u0645\u0627\u0631\u0627\u062a"
                          + "|\u0628\u06be\u0627\u0631\u062a"
                          + "|\u062a\u0648\u0646\u0633"
                          + "|\u0633\u0648\u0631\u064a\u0629"
                          + "|\u0641\u0644\u0633\u0637\u064a\u0646"
                          + "|\u0642\u0637\u0631"
                          + "|\u0645\u0635\u0631"
                          + "|\u092a\u0930\u0940\u0915\u094d\u0937\u093e"
                          + "|\u092d\u093e\u0930\u0924"
                          + "|\u09ad\u09be\u09b0\u09a4"
                          + "|\u0a2d\u0a3e\u0a30\u0a24"
                          + "|\u0aad\u0abe\u0ab0\u0aa4"
                          + "|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe"
                          + "|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8"
                          + "|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd"
                          + "|\u0baa\u0bb0\u0bbf\u0b9f\u0bcd\u0b9a\u0bc8"
                          + "|\u0c2d\u0c3e\u0c30\u0c24\u0c4d"
                          + "|\u0dbd\u0d82\u0d9a\u0dcf"
                          + "|\u0e44\u0e17\u0e22"
                          + "|\u30c6\u30b9\u30c8"
                          + "|\u4e2d\u56fd"
                          + "|\u4e2d\u570b"
                          + "|\u53f0\u6e7e"
                          + "|\u53f0\u7063"
                          + "|\u65b0\u52a0\u5761"
                          + "|\u6d4b\u8bd5"
                          + "|\u6e2c\u8a66"
                          + "|\u9999\u6e2f"
                          + "|\ud14c\uc2a4\ud2b8"
                          + "|\ud55c\uad6d"
                          + "|xn\\-\\-0zwm56d|xn\\-\\-11b5bs3a9aj6g|xn\\-\\-3e0b707e|xn\\-\\-45brj9c"
                          + "|xn\\-\\-80akhbyknj4f|xn\\-\\-90a3ac|xn\\-\\-9t4b11yi5a"
                          + "|xn\\-\\-clchc0ea0b2g2a9gcd|xn\\-\\-deba0ad|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s"
                          + "|xn\\-\\-fpcrj9c3d|xn\\-\\-fzc2c9e2c|xn\\-\\-g6w251d|xn\\-\\-gecrj9c"
                          + "|xn\\-\\-h2brj9c|xn\\-\\-hgbk6aj7f53bba|xn\\-\\-hlcj6aya9esc7a"
                          + "|xn\\-\\-j6w193g|xn\\-\\-jxalpdlp|xn\\-\\-kgbechtv|xn\\-\\-kprw13d"
                          + "|xn\\-\\-kpry57d|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbayh7gpa"
                          + "|xn\\-\\-mgbbh1a71e|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgberp4a5d4ar"
                          + "|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl|xn\\-\\-p1ai|xn\\-\\-pgbs0dh"
                          + "|xn\\-\\-s9brj9c|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a|xn\\-\\-xkc2al3hye2a"
                          + "|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-yfro4i67o|xn\\-\\-ygbi2ammx|xn\\-\\-zckzah"
                          + "|xxx";

          /**
           *  Regular expression to match all IANA top-level domains.
           *  List accurate as of 2011/07/18.  List taken from:
           *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
           *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
           *
           *  @deprecated Due to the recent proliferation of gTLDs, this API is
           *  expected to become out-of-date very quickly. Therefore it is now
           *  deprecated.
           */
          @Deprecated
          public static final String TOP_LEVEL_DOMAIN_STR =
                  "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
                          + "|(biz|b[abdefghijmnorstvwyz])"
                          + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
                          + "|d[ejkmoz]"
                          + "|(edu|e[cegrstu])"
                          + "|f[ijkmor]"
                          + "|(gov|g[abdefghilmnpqrstuwy])"
                          + "|h[kmnrtu]"
                          + "|(info|int|i[delmnoqrst])"
                          + "|(jobs|j[emop])"
                          + "|k[eghimnprwyz]"
                          + "|l[abcikrstuvy]"
                          + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
                          + "|(name|net|n[acefgilopruz])"
                          + "|(org|om)"
                          + "|(pro|p[aefghklmnrstwy])"
                          + "|qa"
                          + "|r[eosuw]"
                          + "|s[abcdeghijklmnortuvyz]"
                          + "|(tel|travel|t[cdfghjklmnoprtvwz])"
                          + "|u[agksyz]"
                          + "|v[aceginu]"
                          + "|w[fs]"
                          + "|(" + LEGACY_IDN_TLD_ALTERNATIVES + ")"
                          + "|y[et]"
                          + "|z[amw])";

          /**
           *  Regular expression pattern to match all IANA top-level domains.
           *  @deprecated This API is deprecated. See {@link #TOP_LEVEL_DOMAIN_STR}.
           */
          @Deprecated
          public static final Pattern TOP_LEVEL_DOMAIN =
                  Pattern.compile(TOP_LEVEL_DOMAIN_STR);

          /**
           *  Regular expression to match all IANA top-level domains for WEB_URL.
           *  Same alternatives as {@link #TOP_LEVEL_DOMAIN_STR}, but using non-capturing groups.
           *  List accurate as of 2011/07/18.  List taken from:
           *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
           *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
           *
           *  @deprecated This API is deprecated. See {@link #TOP_LEVEL_DOMAIN_STR}.
           */
          @Deprecated
          public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
                  "(?:"
                          + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
                          + "|(?:biz|b[abdefghijmnorstvwyz])"
                          + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
                          + "|d[ejkmoz]"
                          + "|(?:edu|e[cegrstu])"
                          + "|f[ijkmor]"
                          + "|(?:gov|g[abdefghilmnpqrstuwy])"
                          + "|h[kmnrtu]"
                          + "|(?:info|int|i[delmnoqrst])"
                          + "|(?:jobs|j[emop])"
                          + "|k[eghimnprwyz]"
                          + "|l[abcikrstuvy]"
                          + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
                          + "|(?:name|net|n[acefgilopruz])"
                          + "|(?:org|om)"
                          + "|(?:pro|p[aefghklmnrstwy])"
                          + "|qa"
                          + "|r[eosuw]"
                          + "|s[abcdeghijklmnortuvyz]"
                          + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
                          + "|u[agksyz]"
                          + "|v[aceginu]"
                          + "|w[fs]"
                          + "|(?:" + LEGACY_IDN_TLD_ALTERNATIVES + ")"
                          + "|y[et]"
                          + "|z[amw]))";

          /**
           *  Regular expression to match all IANA top-level domains.
           *
           *  List accurate as of 2015/11/24.  List taken from:
           *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
           *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
           *
           *  @hide
           */
          static final String IANA_TOP_LEVEL_DOMAINS =
                  "(?:"
                          + "(?:aaa|aarp|abb|abbott|abogado|academy|accenture|accountant|accountants|aco|active"
                          + "|actor|ads|adult|aeg|aero|afl|agency|aig|airforce|airtel|allfinanz|alsace|amica|amsterdam"
                          + "|android|apartments|app|apple|aquarelle|aramco|archi|army|arpa|arte|asia|associates"
                          + "|attorney|auction|audio|auto|autos|axa|azure|a[cdefgilmoqrstuwxz])"
                          + "|(?:band|bank|bar|barcelona|barclaycard|barclays|bargains|bauhaus|bayern|bbc|bbva"
                          + "|bcn|beats|beer|bentley|berlin|best|bet|bharti|bible|bid|bike|bing|bingo|bio|biz|black"
                          + "|blackfriday|bloomberg|blue|bms|bmw|bnl|bnpparibas|boats|bom|bond|boo|boots|boutique"
                          + "|bradesco|bridgestone|broadway|broker|brother|brussels|budapest|build|builders|business"
                          + "|buzz|bzh|b[abdefghijmnorstvwyz])"
                          + "|(?:cab|cafe|cal|camera|camp|cancerresearch|canon|capetown|capital|car|caravan|cards"
                          + "|care|career|careers|cars|cartier|casa|cash|casino|cat|catering|cba|cbn|ceb|center|ceo"
                          + "|cern|cfa|cfd|chanel|channel|chat|cheap|chloe|christmas|chrome|church|cipriani|cisco"
                          + "|citic|city|cityeats|claims|cleaning|click|clinic|clothing|cloud|club|clubmed|coach"
                          + "|codes|coffee|college|cologne|com|commbank|community|company|computer|comsec|condos"
                          + "|construction|consulting|contractors|cooking|cool|coop|corsica|country|coupons|courses"
                          + "|credit|creditcard|creditunion|cricket|crown|crs|cruises|csc|cuisinella|cymru|cyou|c[acdfghiklmnoruvwxyz])"
                          + "|(?:dabur|dad|dance|date|dating|datsun|day|dclk|deals|degree|delivery|dell|delta"
                          + "|democrat|dental|dentist|desi|design|dev|diamonds|diet|digital|direct|directory|discount"
                          + "|dnp|docs|dog|doha|domains|doosan|download|drive|durban|dvag|d[ejkmoz])"
                          + "|(?:earth|eat|edu|education|email|emerck|energy|engineer|engineering|enterprises"
                          + "|epson|equipment|erni|esq|estate|eurovision|eus|events|everbank|exchange|expert|exposed"
                          + "|express|e[cegrstu])"
                          + "|(?:fage|fail|fairwinds|faith|family|fan|fans|farm|fashion|feedback|ferrero|film"
                          + "|final|finance|financial|firmdale|fish|fishing|fit|fitness|flights|florist|flowers|flsmidth"
                          + "|fly|foo|football|forex|forsale|forum|foundation|frl|frogans|fund|furniture|futbol|fyi"
                          + "|f[ijkmor])"
                          + "|(?:gal|gallery|game|garden|gbiz|gdn|gea|gent|genting|ggee|gift|gifts|gives|giving"
                          + "|glass|gle|global|globo|gmail|gmo|gmx|gold|goldpoint|golf|goo|goog|google|gop|gov|grainger"
                          + "|graphics|gratis|green|gripe|group|gucci|guge|guide|guitars|guru|g[abdefghilmnpqrstuwy])"
                          + "|(?:hamburg|hangout|haus|healthcare|help|here|hermes|hiphop|hitachi|hiv|hockey|holdings"
                          + "|holiday|homedepot|homes|honda|horse|host|hosting|hoteles|hotmail|house|how|hsbc|hyundai"
                          + "|h[kmnrtu])"
                          + "|(?:ibm|icbc|ice|icu|ifm|iinet|immo|immobilien|industries|infiniti|info|ing|ink|institute"
                          + "|insure|int|international|investments|ipiranga|irish|ist|istanbul|itau|iwc|i[delmnoqrst])"
                          + "|(?:jaguar|java|jcb|jetzt|jewelry|jlc|jll|jobs|joburg|jprs|juegos|j[emop])"
                          + "|(?:kaufen|kddi|kia|kim|kinder|kitchen|kiwi|koeln|komatsu|krd|kred|kyoto|k[eghimnprwyz])"
                          + "|(?:lacaixa|lancaster|land|landrover|lasalle|lat|latrobe|law|lawyer|lds|lease|leclerc"
                          + "|legal|lexus|lgbt|liaison|lidl|life|lifestyle|lighting|limited|limo|linde|link|live"
                          + "|lixil|loan|loans|lol|london|lotte|lotto|love|ltd|ltda|lupin|luxe|luxury|l[abcikrstuvy])"
                          + "|(?:madrid|maif|maison|man|management|mango|market|marketing|markets|marriott|mba"
                          + "|media|meet|melbourne|meme|memorial|men|menu|meo|miami|microsoft|mil|mini|mma|mobi|moda"
                          + "|moe|moi|mom|monash|money|montblanc|mormon|mortgage|moscow|motorcycles|mov|movie|movistar"
                          + "|mtn|mtpc|mtr|museum|mutuelle|m[acdeghklmnopqrstuvwxyz])"
                          + "|(?:nadex|nagoya|name|navy|nec|net|netbank|network|neustar|new|news|nexus|ngo|nhk"
                          + "|nico|ninja|nissan|nokia|nra|nrw|ntt|nyc|n[acefgilopruz])"
                          + "|(?:obi|office|okinawa|omega|one|ong|onl|online|ooo|oracle|orange|org|organic|osaka"
                          + "|otsuka|ovh|om)"
                          + "|(?:page|panerai|paris|partners|parts|party|pet|pharmacy|philips|photo|photography"
                          + "|photos|physio|piaget|pics|pictet|pictures|ping|pink|pizza|place|play|playstation|plumbing"
                          + "|plus|pohl|poker|porn|post|praxi|press|pro|prod|productions|prof|properties|property"
                          + "|protection|pub|p[aefghklmnrstwy])"
                          + "|(?:qpon|quebec|qa)"
                          + "|(?:racing|realtor|realty|recipes|red|redstone|rehab|reise|reisen|reit|ren|rent|rentals"
                          + "|repair|report|republican|rest|restaurant|review|reviews|rich|ricoh|rio|rip|rocher|rocks"
                          + "|rodeo|rsvp|ruhr|run|rwe|ryukyu|r[eosuw])"
                          + "|(?:saarland|sakura|sale|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|saxo"
                          + "|sbs|sca|scb|schmidt|scholarships|school|schule|schwarz|science|scor|scot|seat|security"
                          + "|seek|sener|services|seven|sew|sex|sexy|shiksha|shoes|show|shriram|singles|site|ski"
                          + "|sky|skype|sncf|soccer|social|software|sohu|solar|solutions|sony|soy|space|spiegel|spreadbetting"
                          + "|srl|stada|starhub|statoil|stc|stcgroup|stockholm|studio|study|style|sucks|supplies"
                          + "|supply|support|surf|surgery|suzuki|swatch|swiss|sydney|systems|s[abcdeghijklmnortuvxyz])"
                          + "|(?:tab|taipei|tatamotors|tatar|tattoo|tax|taxi|team|tech|technology|tel|telefonica"
                          + "|temasek|tennis|thd|theater|theatre|tickets|tienda|tips|tires|tirol|today|tokyo|tools"
                          + "|top|toray|toshiba|tours|town|toyota|toys|trade|trading|training|travel|trust|tui|t[cdfghjklmnortvwz])"
                          + "|(?:ubs|university|uno|uol|u[agksyz])"
                          + "|(?:vacations|vana|vegas|ventures|versicherung|vet|viajes|video|villas|vin|virgin"
                          + "|vision|vista|vistaprint|viva|vlaanderen|vodka|vote|voting|voto|voyage|v[aceginu])"
                          + "|(?:wales|walter|wang|watch|webcam|website|wed|wedding|weir|whoswho|wien|wiki|williamhill"
                          + "|win|windows|wine|wme|work|works|world|wtc|wtf|w[fs])"
                          + "|(?:\u03b5\u03bb|\u0431\u0435\u043b|\u0434\u0435\u0442\u0438|\u043a\u043e\u043c|\u043c\u043a\u0434"
                          + "|\u043c\u043e\u043d|\u043c\u043e\u0441\u043a\u0432\u0430|\u043e\u043d\u043b\u0430\u0439\u043d"
                          + "|\u043e\u0440\u0433|\u0440\u0443\u0441|\u0440\u0444|\u0441\u0430\u0439\u0442|\u0441\u0440\u0431"
                          + "|\u0443\u043a\u0440|\u049b\u0430\u0437|\u0570\u0561\u0575|\u05e7\u05d5\u05dd|\u0627\u0631\u0627\u0645\u0643\u0648"
                          + "|\u0627\u0644\u0627\u0631\u062f\u0646|\u0627\u0644\u062c\u0632\u0627\u0626\u0631|\u0627\u0644\u0633\u0639\u0648\u062f\u064a\u0629"
                          + "|\u0627\u0644\u0645\u063a\u0631\u0628|\u0627\u0645\u0627\u0631\u0627\u062a|\u0627\u06cc\u0631\u0627\u0646"
                          + "|\u0628\u0627\u0632\u0627\u0631|\u0628\u06be\u0627\u0631\u062a|\u062a\u0648\u0646\u0633"
                          + "|\u0633\u0648\u062f\u0627\u0646|\u0633\u0648\u0631\u064a\u0629|\u0634\u0628\u0643\u0629"
                          + "|\u0639\u0631\u0627\u0642|\u0639\u0645\u0627\u0646|\u0641\u0644\u0633\u0637\u064a\u0646"
                          + "|\u0642\u0637\u0631|\u0643\u0648\u0645|\u0645\u0635\u0631|\u0645\u0644\u064a\u0633\u064a\u0627"
                          + "|\u0645\u0648\u0642\u0639|\u0915\u0949\u092e|\u0928\u0947\u091f|\u092d\u093e\u0930\u0924"
                          + "|\u0938\u0902\u0917\u0920\u0928|\u09ad\u09be\u09b0\u09a4|\u0a2d\u0a3e\u0a30\u0a24|\u0aad\u0abe\u0ab0\u0aa4"
                          + "|\u0b87\u0ba8\u0bcd\u0ba4\u0bbf\u0baf\u0bbe|\u0b87\u0bb2\u0b99\u0bcd\u0b95\u0bc8|\u0b9a\u0bbf\u0b99\u0bcd\u0b95\u0baa\u0bcd\u0baa\u0bc2\u0bb0\u0bcd"
                          + "|\u0c2d\u0c3e\u0c30\u0c24\u0c4d|\u0dbd\u0d82\u0d9a\u0dcf|\u0e04\u0e2d\u0e21|\u0e44\u0e17\u0e22"
                          + "|\u10d2\u10d4|\u307f\u3093\u306a|\u30b0\u30fc\u30b0\u30eb|\u30b3\u30e0|\u4e16\u754c"
                          + "|\u4e2d\u4fe1|\u4e2d\u56fd|\u4e2d\u570b|\u4e2d\u6587\u7f51|\u4f01\u4e1a|\u4f5b\u5c71"
                          + "|\u4fe1\u606f|\u5065\u5eb7|\u516b\u5366|\u516c\u53f8|\u516c\u76ca|\u53f0\u6e7e|\u53f0\u7063"
                          + "|\u5546\u57ce|\u5546\u5e97|\u5546\u6807|\u5728\u7ebf|\u5927\u62ff|\u5a31\u4e50|\u5de5\u884c"
                          + "|\u5e7f\u4e1c|\u6148\u5584|\u6211\u7231\u4f60|\u624b\u673a|\u653f\u52a1|\u653f\u5e9c"
                          + "|\u65b0\u52a0\u5761|\u65b0\u95fb|\u65f6\u5c1a|\u673a\u6784|\u6de1\u9a6c\u9521|\u6e38\u620f"
                          + "|\u70b9\u770b|\u79fb\u52a8|\u7ec4\u7ec7\u673a\u6784|\u7f51\u5740|\u7f51\u5e97|\u7f51\u7edc"
                          + "|\u8c37\u6b4c|\u96c6\u56e2|\u98de\u5229\u6d66|\u9910\u5385|\u9999\u6e2f|\ub2f7\ub137"
                          + "|\ub2f7\ucef4|\uc0bc\uc131|\ud55c\uad6d|xbox"
                          + "|xerox|xin|xn\\-\\-11b4c3d|xn\\-\\-1qqw23a|xn\\-\\-30rr7y|xn\\-\\-3bst00m|xn\\-\\-3ds443g"
                          + "|xn\\-\\-3e0b707e|xn\\-\\-3pxu8k|xn\\-\\-42c2d9a|xn\\-\\-45brj9c|xn\\-\\-45q11c|xn\\-\\-4gbrim"
                          + "|xn\\-\\-55qw42g|xn\\-\\-55qx5d|xn\\-\\-6frz82g|xn\\-\\-6qq986b3xl|xn\\-\\-80adxhks"
                          + "|xn\\-\\-80ao21a|xn\\-\\-80asehdb|xn\\-\\-80aswg|xn\\-\\-90a3ac|xn\\-\\-90ais|xn\\-\\-9dbq2a"
                          + "|xn\\-\\-9et52u|xn\\-\\-b4w605ferd|xn\\-\\-c1avg|xn\\-\\-c2br7g|xn\\-\\-cg4bki|xn\\-\\-clchc0ea0b2g2a9gcd"
                          + "|xn\\-\\-czr694b|xn\\-\\-czrs0t|xn\\-\\-czru2d|xn\\-\\-d1acj3b|xn\\-\\-d1alf|xn\\-\\-efvy88h"
                          + "|xn\\-\\-estv75g|xn\\-\\-fhbei|xn\\-\\-fiq228c5hs|xn\\-\\-fiq64b|xn\\-\\-fiqs8s|xn\\-\\-fiqz9s"
                          + "|xn\\-\\-fjq720a|xn\\-\\-flw351e|xn\\-\\-fpcrj9c3d|xn\\-\\-fzc2c9e2c|xn\\-\\-gecrj9c"
                          + "|xn\\-\\-h2brj9c|xn\\-\\-hxt814e|xn\\-\\-i1b6b1a6a2e|xn\\-\\-imr513n|xn\\-\\-io0a7i"
                          + "|xn\\-\\-j1aef|xn\\-\\-j1amh|xn\\-\\-j6w193g|xn\\-\\-kcrx77d1x4a|xn\\-\\-kprw13d|xn\\-\\-kpry57d"
                          + "|xn\\-\\-kput3i|xn\\-\\-l1acc|xn\\-\\-lgbbat1ad8j|xn\\-\\-mgb9awbf|xn\\-\\-mgba3a3ejt"
                          + "|xn\\-\\-mgba3a4f16a|xn\\-\\-mgbaam7a8h|xn\\-\\-mgbab2bd|xn\\-\\-mgbayh7gpa|xn\\-\\-mgbbh1a71e"
                          + "|xn\\-\\-mgbc0a9azcg|xn\\-\\-mgberp4a5d4ar|xn\\-\\-mgbpl2fh|xn\\-\\-mgbtx2b|xn\\-\\-mgbx4cd0ab"
                          + "|xn\\-\\-mk1bu44c|xn\\-\\-mxtq1m|xn\\-\\-ngbc5azd|xn\\-\\-node|xn\\-\\-nqv7f|xn\\-\\-nqv7fs00ema"
                          + "|xn\\-\\-nyqy26a|xn\\-\\-o3cw4h|xn\\-\\-ogbpf8fl|xn\\-\\-p1acf|xn\\-\\-p1ai|xn\\-\\-pgbs0dh"
                          + "|xn\\-\\-pssy2u|xn\\-\\-q9jyb4c|xn\\-\\-qcka1pmc|xn\\-\\-qxam|xn\\-\\-rhqv96g|xn\\-\\-s9brj9c"
                          + "|xn\\-\\-ses554g|xn\\-\\-t60b56a|xn\\-\\-tckwe|xn\\-\\-unup4y|xn\\-\\-vermgensberater\\-ctb"
                          + "|xn\\-\\-vermgensberatung\\-pwb|xn\\-\\-vhquv|xn\\-\\-vuq861b|xn\\-\\-wgbh1c|xn\\-\\-wgbl6a"
                          + "|xn\\-\\-xhq521b|xn\\-\\-xkc2al3hye2a|xn\\-\\-xkc2dl3a5ee0h|xn\\-\\-y9a3aq|xn\\-\\-yfro4i67o"
                          + "|xn\\-\\-ygbi2ammx|xn\\-\\-zfr164b|xperia|xxx|xyz)"
                          + "|(?:yachts|yamaxun|yandex|yodobashi|yoga|yokohama|youtube|y[et])"
                          + "|(?:zara|zip|zone|zuerich|z[amw]))";

          /**
           * Kept for backward compatibility reasons. Contents of a character class (no
           * surrounding brackets).
           *
           * @deprecated Deprecated since it does not include all IRI characters defined in RFC 3987
           */
          @Deprecated
          public static final String GOOD_IRI_CHAR =
                  "a-zA-Z0-9\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF";

          /**
           * Matches a dotted-quad IPv4 address. The first octet must be 1-255 (a leading
           * octet of 0 is rejected); the remaining octets may be 0-255.
           */
          public static final Pattern IP_ADDRESS
                  = Pattern.compile(
                  "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(25[0-5]|2[0-4]"
                          + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]"
                          + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
                          + "|[1-9][0-9]|[0-9]))");

          /**
           * Valid UCS characters defined in RFC 3987. Excludes space characters.
           * This is a complete bracketed character class: BMP ranges, supplementary-plane
           * ranges written as surrogate pairs, intersected with "not a space character".
           */
          private static final String UCS_CHAR = "[" +
                  "\u00A0-\uD7FF" +
                  "\uF900-\uFDCF" +
                  "\uFDF0-\uFFEF" +
                  "\uD800\uDC00-\uD83F\uDFFD" +
                  "\uD840\uDC00-\uD87F\uDFFD" +
                  "\uD880\uDC00-\uD8BF\uDFFD" +
                  "\uD8C0\uDC00-\uD8FF\uDFFD" +
                  "\uD900\uDC00-\uD93F\uDFFD" +
                  "\uD940\uDC00-\uD97F\uDFFD" +
                  "\uD980\uDC00-\uD9BF\uDFFD" +
                  "\uD9C0\uDC00-\uD9FF\uDFFD" +
                  "\uDA00\uDC00-\uDA3F\uDFFD" +
                  "\uDA40\uDC00-\uDA7F\uDFFD" +
                  "\uDA80\uDC00-\uDABF\uDFFD" +
                  "\uDAC0\uDC00-\uDAFF\uDFFD" +
                  "\uDB00\uDC00-\uDB3F\uDFFD" +
                  "\uDB44\uDC00-\uDB7F\uDFFD" +
                  "&&[^\u00A0[\u2000-\u200A]\u2028\u2029\u202F\u3000]]";

          /**
           * Valid characters for IRI label defined in RFC 3987.
           * Meant to be placed inside a character class by the caller.
           */
          private static final String LABEL_CHAR = "a-zA-Z0-9" + UCS_CHAR;

          /**
           * Valid characters for IRI TLD defined in RFC 3987.
           * Meant to be placed inside a character class by the caller.
           */
          private static final String TLD_CHAR = "a-zA-Z" + UCS_CHAR;

          /**
           * RFC 1035 Section 2.3.4 limits the labels to a maximum 63 octets.
           * A label starts and ends with a label character; hyphens are only allowed inside.
           */
          private static final String IRI_LABEL =
                  "[" + LABEL_CHAR + "](?:[" + LABEL_CHAR + "\\-]{0,61}[" + LABEL_CHAR + "]){0,1}";

          /**
           * RFC 3492 references RFC 1034 and limits Punycode algorithm output to 63 characters.
           */
          private static final String PUNYCODE_TLD = "xn\\-\\-[\\w\\-]{0,58}\\w";

          /** Any punycode TLD, or 2 to 63 TLD characters (no check against the IANA list). */
          private static final String TLD = "(" + PUNYCODE_TLD + "|" + "[" + TLD_CHAR + "]{2,63}" +")";

          /** One or more dot-terminated labels followed by a {@link #TLD}. */
          private static final String HOST_NAME = "(" + IRI_LABEL + "\\.)+" + TLD;

          /**
           * Matches either a host name or an IPv4 address. {@link #IP_ADDRESS} is a
           * {@link Pattern}; string concatenation uses its {@code toString()}, which is
           * the source regular expression.
           */
          public static final Pattern DOMAIN_NAME
                  = Pattern.compile("(" + HOST_NAME + "|" + IP_ADDRESS + ")");

          /** Supported schemes (case-insensitive) followed by "://". */
          private static final String PROTOCOL = "(?i:http|https|rtsp):\\/\\/";

          /* A word boundary or end of input.  This is to stop foo.sure from matching as foo.su */
          private static final String WORD_BOUNDARY = "(?:\\b|$|^)";

          /** Optional "user[:password]@" part: 1-64 user characters, 1-25 password characters. */
          private static final String USER_INFO = "(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
                  + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
                  + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@";

          /** A colon followed by 1 to 5 digits. */
          private static final String PORT_NUMBER = "\\:\\d{1,5}";

          /**
           * A slash followed by any number of path/query characters or percent-encoded
           * octets. Because the class includes {@link #LABEL_CHAR}, non-ASCII text that
           * directly follows a URL is treated as part of its path.
           */
          private static final String PATH_AND_QUERY = "\\/(?:(?:[" + LABEL_CHAR
                  + "\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus optional query params
                  + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*";

          /**
           *  Regular expression pattern to match most part of RFC 3987
           *  Internationalized URLs, aka IRIs. The scheme is optional, so bare host
           *  names such as "www.example.com" match as well.
           */
          public static final Pattern WEB_URL = Pattern.compile("("
                  + "("
                  + "(?:" + PROTOCOL + "(?:" + USER_INFO + ")?" + ")?"
                  + "(?:" + DOMAIN_NAME + ")"
                  + "(?:" + PORT_NUMBER + ")?"
                  + ")"
                  + "(" + PATH_AND_QUERY + ")?"
                  + WORD_BOUNDARY
                  + ")");

          /**
           * Regular expression that matches known TLDs and punycode TLDs
           */
          private static final String STRICT_TLD = "(?:" +
                  IANA_TOP_LEVEL_DOMAINS + "|" + PUNYCODE_TLD + ")";

          /**
           * Regular expression that matches host names using {@link #STRICT_TLD}
           */
          private static final String STRICT_HOST_NAME = "(?:(?:" + IRI_LABEL + "\\.)+"
                  + STRICT_TLD + ")";

          /**
           * Regular expression that matches domain names using either {@link #STRICT_HOST_NAME} or
           * {@link #IP_ADDRESS}
           */
          private static final Pattern STRICT_DOMAIN_NAME
                  = Pattern.compile("(?:" + STRICT_HOST_NAME + "|" + IP_ADDRESS + ")");

          /**
           * Regular expression that matches domain names without a TLD
           */
          private static final String RELAXED_DOMAIN_NAME =
                  "(?:" + "(?:" + IRI_LABEL + "(?:\\.(?=\\S))" +"?)+" + "|" + IP_ADDRESS + ")";

          /**
           * Regular expression to match strings that do not start with a supported protocol. The TLDs
           * are expected to be one of the known TLDs.
           */
          private static final String WEB_URL_WITHOUT_PROTOCOL = "("
                  + WORD_BOUNDARY
                  + "(?<!:\\/\\/)"
                  + "("
                  + "(?:" + STRICT_DOMAIN_NAME + ")"
                  + "(?:" + PORT_NUMBER + ")?"
                  + ")"
                  + "(?:" + PATH_AND_QUERY + ")?"
                  + WORD_BOUNDARY
                  + ")";

          /**
           * Regular expression to match strings that start with a supported protocol. Rules for domain
           * names and TLDs are more relaxed. TLDs are optional.
           */
          private static final String WEB_URL_WITH_PROTOCOL = "("
                  + WORD_BOUNDARY
                  + "(?:"
                  + "(?:" + PROTOCOL + "(?:" + USER_INFO + ")?" + ")"
                  + "(?:" + RELAXED_DOMAIN_NAME + ")?"
                  + "(?:" + PORT_NUMBER + ")?"
                  + ")"
                  + "(?:" + PATH_AND_QUERY + ")?"
                  + WORD_BOUNDARY
                  + ")";

          /**
           * Regular expression pattern to match IRIs. If a string starts with http(s):// the expression
           * tries to match the URL structure with a relaxed rule for TLDs. If the string does not start
           * with http(s):// the TLDs are expected to be one of the known TLDs.
           *
           * @hide
           */
          public static final Pattern AUTOLINK_WEB_URL = Pattern.compile(
                  "(" + WEB_URL_WITH_PROTOCOL + "|" + WEB_URL_WITHOUT_PROTOCOL + ")");

          /**
           * Regular expression for valid email characters. Does not include some of the valid characters
           * defined in RFC5321: #&~!^`{}/=$*?|
           */
          private static final String EMAIL_CHAR = LABEL_CHAR + "\\+\\-_%'";

          /**
           * Regular expression for local part of an email address. RFC5321 section 4.5.3.1.1 limits
           * the local part to be at most 64 octets.
           */
          private static final String EMAIL_ADDRESS_LOCAL_PART =
                  "[" + EMAIL_CHAR + "]" + "(?:[" + EMAIL_CHAR + "\\.]{1,62}[" + EMAIL_CHAR + "])?";

          /**
           * Regular expression for the domain part of an email address. RFC5321 section 4.5.3.1.2 limits
           * the domain to be at most 255 octets.
           */
          private static final String EMAIL_ADDRESS_DOMAIN =
                  "(?=.{1,255}(?:\\s|$|^))" + HOST_NAME;

          /**
           * Regular expression pattern to match email addresses. It excludes double quoted local parts
           * and the special characters #&~!^`{}/=$*?| that are included in RFC5321.
           * @hide
           */
          public static final Pattern AUTOLINK_EMAIL_ADDRESS = Pattern.compile("(" + WORD_BOUNDARY +
                  "(?:" + EMAIL_ADDRESS_LOCAL_PART + "@" + EMAIL_ADDRESS_DOMAIN + ")" +
                  WORD_BOUNDARY + ")"
          );

          /**
           * Simple ASCII-only e-mail pattern: 1-256 local characters, "@", then two or
           * more dot-separated labels that start with a letter or digit.
           */
          public static final Pattern EMAIL_ADDRESS
                  = Pattern.compile(
                  "[a-zA-Z0-9\\+\\.\\_\\%\\-\\+]{1,256}" +
                          "\\@" +
                          "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}" +
                          "(" +
                          "\\." +
                          "[a-zA-Z0-9][a-zA-Z0-9\\-]{0,25}" +
                          ")+"
          );

          /**
           * This pattern is intended for searching for things that look like they
           * might be phone numbers in arbitrary text, not for validating whether
           * something is in fact a phone number.  It will miss many things that
           * are legitimate phone numbers.
           *
           * <p> The pattern matches the following:
           * <ul>
           * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
           * may follow.
           * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
           * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
           * </ul>
           */
          public static final Pattern PHONE
                  = Pattern.compile(                      // sdd = space, dot, or dash
                  "(\\+[0-9]+[\\- \\.]*)?"        // +<digits><sdd>*
                          + "(\\([0-9]+\\)[\\- \\.]*)?"   // (<digits>)<sdd>*
                          + "([0-9][0-9\\- \\.]+[0-9])"); // <digit><digit|sdd>+<digit>

          /**
           *  Convenience method to take all of the non-null matching groups in a
           *  regex Matcher and return them as a concatenated string. Group 0 (the
           *  whole match) is not included.
           *
           *  @param matcher      The Matcher object from which grouped text will
           *                      be extracted; it must hold a successful match
           *
           *  @return             A String comprising all of the non-null matched
           *                      groups concatenated together
           */
          public static final String concatGroups(Matcher matcher) {
              StringBuilder b = new StringBuilder();
              final int numGroups = matcher.groupCount();

              for (int i = 1; i <= numGroups; i++) {
                  String s = matcher.group(i);

                  // Groups that did not participate in the match are reported as null.
                  if (s != null) {
                      b.append(s);
                  }
              }

              return b.toString();
          }

          /**
           * Convenience method to return only the digits and plus signs
           * in the matching string.
           *
           * @param matcher      The Matcher object from which digits and plus will
           *                     be extracted; it must hold a successful match
           *
           * @return             A String comprising all of the digits and plus in
           *                     the match
           */
          public static final String digitsAndPlusOnly(Matcher matcher) {
              StringBuilder buffer = new StringBuilder();
              String matchingRegion = matcher.group();

              for (int i = 0, size = matchingRegion.length(); i < size; i++) {
                  char character = matchingRegion.charAt(i);

                  if (character == '+' || Character.isDigit(character)) {
                      buffer.append(character);
                  }
              }
              return buffer.toString();
          }

          /**
           * Do not create this static utility class.
           */
          private Patterns() {}
      }


    调用方法

           // Wrap every URL found in message in an <a> tag, building the result in one pass.
           // String.replaceAll(link, ...) must not be used here: it would treat the URL as a
           // regular expression ('.', '?', '+', '(' are metacharacters), interpret '$' and '\'
           // in the replacement, and re-wrap links that occur more than once.
           // NOTE: WEB_URL also matches links without a scheme (e.g. "www.example.com"); such
           // an href is resolved by browsers as a relative link.
           Matcher matcher = Patterns.WEB_URL.matcher(message);
           StringBuffer linked = new StringBuffer();
           while (matcher.find()) {
               String link = matcher.group();
               // '&' and the single quote are legal in a matched URL; escape them so the
               // link cannot terminate the single-quoted href attribute.
               String escaped = link.replace("&", "&amp;").replace("'", "&#39;");
               String restr = "<a href='" + escaped + "' target='_blank'>" + escaped + "</a>";
               // quoteReplacement keeps '$' and '\' in the URL literal.
               matcher.appendReplacement(linked, Matcher.quoteReplacement(restr));
           }
           matcher.appendTail(linked);
           message = linked.toString();
    

      

    说明:

      1、message为要进行提取的字符串

      2、以 http/https 开头的链接可以被提取,但链接结尾不能紧跟其他文字(例如中文),否则提取结果不准确。原因是该正则按 RFC 3987(IRI)匹配,路径部分允许中文等非 ASCII 字符,紧随其后的文字会被当作链接的一部分;因此链接后面应以空格等字符分隔。

      即:"你真的是https://www.cnblogs.com/pxblog博客网"会获取到"https://www.cnblogs.com/pxblog博客网"

          "你真的是https://www.cnblogs.com/pxblog 博客网"会获取到"https://www.cnblogs.com/pxblog"

  • 相关阅读:
    【HDOJ5971】Wrestling Match(二分图,并查集)
    【HDOJ5978】To begin or not to begin(概率)
    【HDOJ5979】Convex(三角函数)
    【HDOJ5980】Find Small A(签到)
    【HDOJ5949】Relative atomic mass(签到)
    【HDOJ5948】Thickest Burger(签到)
    【HDOJ6228】Tree(树)
    【HDOJ6227】Rabbits(贪心)
    147.命题逻辑
    146.离散数学
  • 原文地址:https://www.cnblogs.com/pxblog/p/12610060.html
Copyright © 2011-2022 走看看