我试图转换一些字符串,在法国加拿大,基本上,我想能够拿出法国重音标记在字母,同时保持字母。(例如,将é转换为e,那么crème brûlée就会变成creme brulee)

实现这一目标的最佳方法是什么?


当前回答

为所有找到Lucene的人。Net作为一个多余的删除变音符,我设法找到这个小库,利用ASCII音译为您。

https://github.com/anyascii/anyascii

其他回答

试试helppersharp包。

有一个方法removeaccent:

 public static string RemoveAccents(this string source)
 {
     //8 bit characters 
     byte[] b = Encoding.GetEncoding(1251).GetBytes(source);

     // 7 bit characters
     string t = Encoding.ASCII.GetString(b);
     Regex re = new Regex("[^a-zA-Z0-9]=-_/");
     string c = re.Replace(t, " ");
     return c;
 }

公认的答案是完全正确的,但是现在,它应该更新为使用符文类而不是CharUnicodeInfo,因为c#和。net在最新版本中更新了分析字符串的方法(符文类已在。net Core 3.0中添加)。

下面的代码现在推荐用于。net 5+,因为它可以进一步用于非拉丁字符:

static string RemoveDiacritics(string text) 
{
    var normalizedString = text.Normalize(NormalizationForm.FormD);
    var stringBuilder = new StringBuilder();

    foreach (var c in normalizedString.EnumerateRunes())
    {
        var unicodeCategory = Rune.GetUnicodeCategory(c);
        if (unicodeCategory != UnicodeCategory.NonSpacingMark)
        {
            stringBuilder.Append(c);
        }
    }

    return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}

我需要一些东西,转换所有主要的unicode字符和投票的答案留下了一些,所以我已经创建了一个CodeIgniter的convert_accented_characters($str)的版本为c#,很容易自定义:

using System;
using System.Text;
using System.Collections.Generic;

public static class Strings
{
    static Dictionary<string, string> foreign_characters = new Dictionary<string, string>
    {
        { "äæǽ", "ae" },
        { "öœ", "oe" },
        { "ü", "ue" },
        { "Ä", "Ae" },
        { "Ü", "Ue" },
        { "Ö", "Oe" },
        { "ÀÁÂÃÄÅǺĀĂĄǍΑΆẢẠẦẪẨẬẰẮẴẲẶА", "A" },
        { "àáâãåǻāăąǎªαάảạầấẫẩậằắẵẳặа", "a" },
        { "Б", "B" },
        { "б", "b" },
        { "ÇĆĈĊČ", "C" },
        { "çćĉċč", "c" },
        { "Д", "D" },
        { "д", "d" },
        { "ÐĎĐΔ", "Dj" },
        { "ðďđδ", "dj" },
        { "ÈÉÊËĒĔĖĘĚΕΈẼẺẸỀẾỄỂỆЕЭ", "E" },
        { "èéêëēĕėęěέεẽẻẹềếễểệеэ", "e" },
        { "Ф", "F" },
        { "ф", "f" },
        { "ĜĞĠĢΓГҐ", "G" },
        { "ĝğġģγгґ", "g" },
        { "ĤĦ", "H" },
        { "ĥħ", "h" },
        { "ÌÍÎÏĨĪĬǏĮİΗΉΊΙΪỈỊИЫ", "I" },
        { "ìíîïĩīĭǐįıηήίιϊỉịиыї", "i" },
        { "Ĵ", "J" },
        { "ĵ", "j" },
        { "ĶΚК", "K" },
        { "ķκк", "k" },
        { "ĹĻĽĿŁΛЛ", "L" },
        { "ĺļľŀłλл", "l" },
        { "М", "M" },
        { "м", "m" },
        { "ÑŃŅŇΝН", "N" },
        { "ñńņňʼnνн", "n" },
        { "ÒÓÔÕŌŎǑŐƠØǾΟΌΩΏỎỌỒỐỖỔỘỜỚỠỞỢО", "O" },
        { "òóôõōŏǒőơøǿºοόωώỏọồốỗổộờớỡởợо", "o" },
        { "П", "P" },
        { "п", "p" },
        { "ŔŖŘΡР", "R" },
        { "ŕŗřρр", "r" },
        { "ŚŜŞȘŠΣС", "S" },
        { "śŝşșšſσςс", "s" },
        { "ȚŢŤŦτТ", "T" },
        { "țţťŧт", "t" },
        { "ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛŨỦỤỪỨỮỬỰУ", "U" },
        { "ùúûũūŭůűųưǔǖǘǚǜυύϋủụừứữửựу", "u" },
        { "ÝŸŶΥΎΫỲỸỶỴЙ", "Y" },
        { "ýÿŷỳỹỷỵй", "y" },
        { "В", "V" },
        { "в", "v" },
        { "Ŵ", "W" },
        { "ŵ", "w" },
        { "ŹŻŽΖЗ", "Z" },
        { "źżžζз", "z" },
        { "ÆǼ", "AE" },
        { "ß", "ss" },
        { "IJ", "IJ" },
        { "ij", "ij" },
        { "Œ", "OE" },
        { "ƒ", "f" },
        { "ξ", "ks" },
        { "π", "p" },
        { "β", "v" },
        { "μ", "m" },
        { "ψ", "ps" },
        { "Ё", "Yo" },
        { "ё", "yo" },
        { "Є", "Ye" },
        { "є", "ye" },
        { "Ї", "Yi" },
        { "Ж", "Zh" },
        { "ж", "zh" },
        { "Х", "Kh" },
        { "х", "kh" },
        { "Ц", "Ts" },
        { "ц", "ts" },
        { "Ч", "Ch" },
        { "ч", "ch" },
        { "Ш", "Sh" },
        { "ш", "sh" },
        { "Щ", "Shch" },
        { "щ", "shch" },
        { "ЪъЬь", "" },
        { "Ю", "Yu" },
        { "ю", "yu" },
        { "Я", "Ya" },
        { "я", "ya" },
    };

    public static char RemoveDiacritics(this char c){
        foreach(KeyValuePair<string, string> entry in foreign_characters)
        {
            if(entry.Key.IndexOf (c) != -1)
            {
                return entry.Value[0];
            }
        }
        return c;
    }

    public static string RemoveDiacritics(this string s) 
    {
        //StringBuilder sb = new StringBuilder ();
        string text = "";


        foreach (char c in s)
        {
            int len = text.Length;

            foreach(KeyValuePair<string, string> entry in foreign_characters)
            {
                if(entry.Key.IndexOf (c) != -1)
                {
                    text += entry.Value;
                    break;
                }
            }

            if (len == text.Length) {
                text += c;  
            }
        }
        return text;
    }
}

使用

// for strings
"crème brûlée".RemoveDiacritics (); // creme brulee

// for chars
"Ã"[0].RemoveDiacritics (); // A

我经常使用基于我在这里找到的另一个版本的扩展方法 (参见在c# (ascii)中替换字符) 简单解释一下:

归一化形成D,将è等字符分割为e和非空格' 由此,nospacing字符被移除 结果归一化回形式C(我不确定这是否有必要)

代码:

using System.Linq;
using System.Text;
using System.Globalization;

// namespace here
public static class Utility
{
    public static string RemoveDiacritics(this string str)
    {
        if (null == str) return null;
        var chars =
            from c in str.Normalize(NormalizationForm.FormD).ToCharArray()
            let uc = CharUnicodeInfo.GetUnicodeCategory(c)
            where uc != UnicodeCategory.NonSpacingMark
            select c;

        var cleanStr = new string(chars.ToArray()).Normalize(NormalizationForm.FormC);

        return cleanStr;
    }

    // or, alternatively
    public static string RemoveDiacritics2(this string str)
    {
        if (null == str) return null;
        var chars = str
            .Normalize(NormalizationForm.FormD)
            .ToCharArray()
            .Where(c=> CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
            .ToArray();

        return new string(chars).Normalize(NormalizationForm.FormC);
    }
}

你可以从MMLib中使用字符串扩展。扩展nuget包:

using MMLib.RapidPrototyping.Generators;
public void ExtensionsExample()
{
  string target = "aácčeéií";
  Assert.AreEqual("aacceeii", target.RemoveDiacritics());
} 

Nuget页面:https://www.nuget.org/packages/MMLib.Extensions/ Codeplex项目网站https://mmlib.codeplex.com/