我试图转换一些字符串,在法国加拿大,基本上,我想能够拿出法国重音标记在字母,同时保持字母。(例如,将é转换为e,那么crème brûlée就会变成creme brulee)

实现这一目标的最佳方法是什么?


当前回答

有趣的是,这样的问题可以得到这么多答案,但没有一个符合我的要求:)周围有这么多语言,一个完全语言不可知的解决方案是不可能的,因为其他人已经提到FormC或FormD正在给出问题。

由于最初的问题与法语有关,最简单的答案确实是

    public static string ConvertWesternEuropeanToASCII(this string str)
    {
        return Encoding.ASCII.GetString(Encoding.GetEncoding(1251).GetBytes(str));
    }

1251应该被输入语言的编码代码所取代。

但是,这只能用一个字符替换一个字符。由于我也使用德语作为输入,所以我进行了手动转换

    public static string LatinizeGermanCharacters(this string str)
    {
        StringBuilder sb = new StringBuilder(str.Length);
        foreach (char c in str)
        {
            switch (c)
            {
                case 'ä':
                    sb.Append("ae");
                    break;
                case 'ö':
                    sb.Append("oe");
                    break;
                case 'ü':
                    sb.Append("ue");
                    break;
                case 'Ä':
                    sb.Append("Ae");
                    break;
                case 'Ö':
                    sb.Append("Oe");
                    break;
                case 'Ü':
                    sb.Append("Ue");
                    break;
                case 'ß':
                    sb.Append("ss");
                    break;
                default:
                    sb.Append(c);
                    break;
            }
        }
        return sb.ToString();
    }

它可能不能提供最好的性能,但至少它非常易于阅读和扩展。 Regex是一个不去,比任何字符/字符串的东西慢得多。

我还有一个非常简单的方法来删除空间:

    public static string RemoveSpace(this string str)
    {
        return str.Replace(" ", string.Empty);
    }

最终,我使用以上3个扩展的组合:

    public static string LatinizeAndConvertToASCII(this string str, bool keepSpace = false)
    {
        str = str.LatinizeGermanCharacters().ConvertWesternEuropeanToASCII();            
        return keepSpace ? str : str.RemoveSpace();
    }

并对其进行了一个小单元测试(不详尽),以成功通过。

    [TestMethod()]
    public void LatinizeAndConvertToASCIITest()
    {
        string europeanStr = "Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ";
        string expected = "Bonjourcava?C'estl'ete!IchmoechteaeAeaaaeeeeEEiIiiiooooeOeUeueuuuUyYcCnN";
        string actual = europeanStr.LatinizeAndConvertToASCII();
        Assert.AreEqual(expected, actual);
    }

其他回答

这是VB版本(工作与希腊):

导入系统。文本

导入系统。全球化

Public Function RemoveDiacritics(ByVal s As String)
    Dim normalizedString As String
    Dim stringBuilder As New StringBuilder
    normalizedString = s.Normalize(NormalizationForm.FormD)
    Dim i As Integer
    Dim c As Char
    For i = 0 To normalizedString.Length - 1
        c = normalizedString(i)
        If CharUnicodeInfo.GetUnicodeCategory(c) <> UnicodeCategory.NonSpacingMark Then
            stringBuilder.Append(c)
        End If
    Next
    Return stringBuilder.ToString()
End Function

我经常使用基于我在这里找到的另一个版本的扩展方法 (参见在c# (ascii)中替换字符) 简单解释一下:

归一化形成D,将è等字符分割为e和非空格' 由此,nospacing字符被移除 结果归一化回形式C(我不确定这是否有必要)

代码:

using System.Linq;
using System.Text;
using System.Globalization;

// namespace here
public static class Utility
{
    public static string RemoveDiacritics(this string str)
    {
        if (null == str) return null;
        var chars =
            from c in str.Normalize(NormalizationForm.FormD).ToCharArray()
            let uc = CharUnicodeInfo.GetUnicodeCategory(c)
            where uc != UnicodeCategory.NonSpacingMark
            select c;

        var cleanStr = new string(chars.ToArray()).Normalize(NormalizationForm.FormC);

        return cleanStr;
    }

    // or, alternatively
    public static string RemoveDiacritics2(this string str)
    {
        if (null == str) return null;
        var chars = str
            .Normalize(NormalizationForm.FormD)
            .ToCharArray()
            .Where(c=> CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
            .ToArray();

        return new string(chars).Normalize(NormalizationForm.FormC);
    }
}

如果有人感兴趣,我正在寻找类似的东西,最后写了如下:

public static string NormalizeStringForUrl(string name)
{
    String normalizedString = name.Normalize(NormalizationForm.FormD);
    StringBuilder stringBuilder = new StringBuilder();

    foreach (char c in normalizedString)
    {
        switch (CharUnicodeInfo.GetUnicodeCategory(c))
        {
            case UnicodeCategory.LowercaseLetter:
            case UnicodeCategory.UppercaseLetter:
            case UnicodeCategory.DecimalDigitNumber:
                stringBuilder.Append(c);
                break;
            case UnicodeCategory.SpaceSeparator:
            case UnicodeCategory.ConnectorPunctuation:
            case UnicodeCategory.DashPunctuation:
                stringBuilder.Append('_');
                break;
        }
    }
    string result = stringBuilder.ToString();
    return String.Join("_", result.Split(new char[] { '_' }
        , StringSplitOptions.RemoveEmptyEntries)); // remove duplicate underscores
}

这就是我如何在所有的。net程序中替换变音符字符为非变音符字符

C#:

//Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter 'é' is substituted by an 'e'
public string RemoveDiacritics(string s)
{
    string normalizedString = null;
    StringBuilder stringBuilder = new StringBuilder();
    normalizedString = s.Normalize(NormalizationForm.FormD);
    int i = 0;
    char c = '\0';

    for (i = 0; i <= normalizedString.Length - 1; i++)
    {
        c = normalizedString[i];
        if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
        {
            stringBuilder.Append(c);
        }
    }

    return stringBuilder.ToString().ToLower();
}

VB .NET:

'Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter "é" is substituted by an "e"'
Public Function RemoveDiacritics(ByVal s As String) As String
    Dim normalizedString As String
    Dim stringBuilder As New StringBuilder
    normalizedString = s.Normalize(NormalizationForm.FormD)
    Dim i As Integer
    Dim c As Char

    For i = 0 To normalizedString.Length - 1
        c = normalizedString(i)
        If CharUnicodeInfo.GetUnicodeCategory(c) <> UnicodeCategory.NonSpacingMark Then
            stringBuilder.Append(c)
        End If
    Next
    Return stringBuilder.ToString().ToLower()
End Function

公认的答案是完全正确的,但是现在,它应该更新为使用符文类而不是CharUnicodeInfo,因为c#和。net在最新版本中更新了分析字符串的方法(符文类已在。net Core 3.0中添加)。

下面的代码现在推荐用于。net 5+,因为它可以进一步用于非拉丁字符:

static string RemoveDiacritics(string text) 
{
    var normalizedString = text.Normalize(NormalizationForm.FormD);
    var stringBuilder = new StringBuilder();

    foreach (var c in normalizedString.EnumerateRunes())
    {
        var unicodeCategory = Rune.GetUnicodeCategory(c);
        if (unicodeCategory != UnicodeCategory.NonSpacingMark)
        {
            stringBuilder.Append(c);
        }
    }

    return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}