zoukankan      html  css  js  c++  java
  • 移除重复字符的几个算法简单比较

       /// <summary>
       /// Demo program comparing several ways of removing duplicated characters
       /// from a string. Every RemoveDuplicatedChar_N method keeps the first
       /// occurrence of each character and returns inputs that are null or
       /// shorter than two characters unchanged (and unvalidated).
       /// </summary>
       class Program
       {
           static void Main(string[] args)
           {
               // Verbatim literal: the backslash is a path separator here, not an escape.
               string s = File.ReadAllText(@"e:\test.txt");
               Program p = new Program();
               string r = p.RemoveDuplicatedChar_1(s);
               r = p.RemoveDuplicatedChar_2(s);
               r = p.RemoveDuplicatedChar_6(s);
           }

           /// <summary>
           /// Tells whether a string cannot contain a duplicate at all
           /// (null, empty, or a single character).
           /// </summary>
           /// <param name="s">The candidate input; may be null.</param>
           /// <returns>true when the input has fewer than two characters.</returns>
           private static bool IsTooShortForDuplicates(string s)
           {
               return s == null || s.Length < 2;
           }

           /// <summary>
           /// Since this is C#, the first idea is to use what the class library
           /// already offers. HashSet is the sensible choice; note that the class
           /// only exists since .NET 3.5.
           /// </summary>
           /// <param name="s">The input string; may be null.</param>
           /// <returns>
           /// The input with every repeated character dropped, in order of first
           /// occurrence.
           /// </returns>
           public string RemoveDuplicatedChar_1(string s)
           {
               if (IsTooShortForDuplicates(s))
               {
                   return s;
               }

               // HashSet<T> documents no enumeration order, so the set is used only
               // as a membership filter; the order comes from walking the input.
               HashSet<char> seen = new HashSet<char>();
               char[] buffer = s.ToCharArray();
               int uniqueCount = 0;

               for (int i = 0; i < buffer.Length; i++)
               {
                   // Add returns false when the character is already in the set.
                   if (seen.Add(buffer[i]))
                   {
                       // uniqueCount never exceeds i, so compacting in place is safe.
                       buffer[uniqueCount] = buffer[i];
                       uniqueCount++;
                   }
               }

               return new string(buffer, 0, uniqueCount);
           }

           /// <summary>
           /// The LINQ Distinct operator can do the job as well.
           /// </summary>
           /// <param name="s">The input string; may be null.</param>
           /// <returns>The distinct characters of the input as a new string.</returns>
           public string RemoveDuplicatedChar_2(string s)
           {
               if (IsTooShortForDuplicates(s))
               {
                   return s;
               }

               // NOTE(review): the implementation quoted below yields items in
               // first-occurrence order, but the public documentation of Distinct
               // describes the result as unordered - confirm before relying on it.
               return new string(s.Distinct().ToArray());
           }

           // The implementation of Distinct relies on a linked list plus hashing;
           // the decompiled framework code is quoted here for reference only.
           /****
                   [__DynamicallyInvokable]
                   public static IEnumerable<TSource> Distinct<TSource>(this IEnumerable<TSource> source)
                   {
                       if (source == null)
                       {
                           throw Error.ArgumentNull("source");
                       }
                       return DistinctIterator<TSource>(source, null);
                   }

                   private static IEnumerable<TSource> DistinctIterator<TSource>(IEnumerable<TSource> source, IEqualityComparer<TSource> comparer)
                   {
                       Set<TSource> iteratorVariable0 = new Set<TSource>(comparer);
                       foreach (TSource iteratorVariable1 in source)
                       {
                           if (iteratorVariable0.Add(iteratorVariable1))
                           {
                               yield return iteratorVariable1;
                           }
                       }
                   }

                   public bool Add(TElement value)
                   {
                       return !this.Find(value, true);
                   }

                   private bool Find(TElement value, bool add)
                   {
                       int hashCode = this.InternalGetHashCode(value);
                       for (int i = this.buckets[hashCode % this.buckets.Length] - 1; i >= 0; i = this.slots[i].next)
                       {
                           if ((this.slots[i].hashCode == hashCode) && this.comparer.Equals(this.slots[i].value, value))
                           {
                               return true;
                           }
                       }
                       if (add)
                       {
                           int freeList;
                           if (this.freeList >= 0)
                           {
                               freeList = this.freeList;
                               this.freeList = this.slots[freeList].next;
                           }
                           else
                           {
                               if (this.count == this.slots.Length)
                               {
                                   this.Resize();
                               }
                               freeList = this.count;
                               this.count++;
                           }
                           int index = hashCode % this.buckets.Length;
                           this.slots[freeList].hashCode = hashCode;
                           this.slots[freeList].value = value;
                           this.slots[freeList].next = this.buckets[index] - 1;
                           this.buckets[index] = freeList + 1;
                       }
                       return false;
                   }
            ****/

           /// <summary>
           /// Remember this is an exercise, so assumptions are allowed.
           /// First assumption: every character lies in 'a'-'z', hence the 26
           /// letters can be mapped onto the bits of one 32-bit integer.
           /// </summary>
           /// <param name="s">The input string; may be null.</param>
           /// <returns>
           /// The input with every repeated character dropped, in order of first
           /// occurrence.
           /// </returns>
           /// <exception cref="ArgumentException">
           /// A string of two or more characters contains a character outside 'a'-'z'.
           /// </exception>
           public string RemoveDuplicatedChar_3(string s)
           {
               if (IsTooShortForDuplicates(s))
               {
                   return s;
               }

               char[] buffer = s.ToCharArray();
               uint seen = 0;
               int uniqueCount = 0;

               for (int i = 0; i < buffer.Length; i++)
               {
                   char c = buffer[i];
                   if (c < 'a' || c > 'z')
                   {
                       throw new ArgumentException("char should be in range(a-z)");
                   }

                   // The range check above bounds the shift to 0..25, so no modulo
                   // is needed to stay inside the 32-bit mask.
                   uint bit = 1u << (c - 'a');
                   if ((seen & bit) == 0)
                   {
                       buffer[uniqueCount] = c;
                       uniqueCount++;
                       seen |= bit;
                   }
               }

               return new string(buffer, 0, uniqueCount);
           }

           /// <summary>
           /// 256 character codes can be mapped onto eight 32-bit integers.
           /// The accepted range is 0-255, although the error message calls it ASCII.
           /// </summary>
           /// <param name="s">The input string; may be null.</param>
           /// <returns>
           /// The input with every repeated character dropped, in order of first
           /// occurrence.
           /// </returns>
           /// <exception cref="ArgumentException">
           /// A string of two or more characters contains a character above 255.
           /// </exception>
           public string RemoveDuplicatedChar_4(string s)
           {
               if (IsTooShortForDuplicates(s))
               {
                   return s;
               }

               char[] buffer = s.ToCharArray();
               uint[] seen = new uint[8];
               int uniqueCount = 0;

               for (int i = 0; i < buffer.Length; i++)
               {
                   char c = buffer[i];
                   if (c > 255)
                   {
                       throw new ArgumentException("char should be in ASCII");
                   }

                   // Word selects one of the eight masks, bit the position inside it.
                   int word = c / 32;
                   uint bit = 1u << (c % 32);
                   if ((seen[word] & bit) == 0)
                   {
                       buffer[uniqueCount] = c;
                       uniqueCount++;
                       seen[word] |= bit;
                   }
               }

               return new string(buffer, 0, uniqueCount);
           }

           /// <summary>
           /// Looks familiar: the same idea as the bit-mask version, with one bool
           /// flag per character code 0-255 instead of packed bits.
           /// </summary>
           /// <param name="s">The input string; may be null.</param>
           /// <returns>
           /// The input with every repeated character dropped, in order of first
           /// occurrence.
           /// </returns>
           /// <exception cref="ArgumentException">
           /// A string of two or more characters contains a character above 255.
           /// </exception>
           public string RemoveDuplicatedChar_5(string s)
           {
               if (IsTooShortForDuplicates(s))
               {
                   return s;
               }

               char[] buffer = s.ToCharArray();
               bool[] seen = new bool[256];
               int uniqueCount = 0;

               for (int i = 0; i < buffer.Length; i++)
               {
                   char c = buffer[i];
                   if (c > 255)
                   {
                       throw new ArgumentException("char should be in ASCII");
                   }

                   if (!seen[c])
                   {
                       buffer[uniqueCount] = c;
                       uniqueCount++;
                       seen[c] = true;
                   }
               }

               return new string(buffer, 0, uniqueCount);
           }

           /// <summary>
           /// O(n squared) implementation that needs neither an extra flag buffer
           /// nor any advanced data structure, and places no restriction on the
           /// character range. On large strings its running time is expected to be
           /// far worse than that of the first and second methods; the article
           /// gives a timing comparison after the listing.
           /// </summary>
           /// <param name="s">The input string; may be null.</param>
           /// <returns>
           /// The input with every repeated character dropped, in order of first
           /// occurrence.
           /// </returns>
           public string RemoveDuplicatedChar_6(string s)
           {
               if (IsTooShortForDuplicates(s))
               {
                   return s;
               }

               char[] buffer = s.ToCharArray();

               // The first character is always kept, so scanning starts at index 1.
               int uniqueCount = 1;

               for (int i = 1; i < buffer.Length; i++)
               {
                   // Search only the already-compacted prefix [0, uniqueCount).
                   if (Array.IndexOf(buffer, buffer[i], 0, uniqueCount) < 0)
                   {
                       buffer[uniqueCount] = buffer[i];
                       uniqueCount++;
                   }
               }

               return new string(buffer, 0, uniqueCount);
           }
       }

    在处理百万级别的文本时,1,2,6的运行时间比较:

    image

    /****
    对于纯ASCII的大字符串,给出了5个方法的时间比较如下图
    可以看出4,5在时间效率上超过了1和2
    所以在处理混合型字符串时,是否应该考虑综合应用这些算法呢?
    答案是明显的
    ****/

    image(五个方法的运行时间比较图)

  • 相关阅读:
    .Net Core Swagger配置
    MySQL如何使用索引
    一个HTTP Basic Authentication引发的异常
    跑步花钱吗?
    跑步花钱吗?
    OpenShift中的持续交付
    在AWS中部署OpenShift平台
    壮美大山包-2017中国大山包国际超百公里ITRA积分赛赛记
    膝盖中了一箭之康复篇-两周年纪念
    HashiCorp Vault介绍
  • 原文地址:https://www.cnblogs.com/dancewithautomation/p/3493772.html
Copyright © 2011-2022 走看看