如何从字符串中删除重音字符? 特别是在IE6中,我有这样的东西:

accentsTidy = function(s){
    var r=s.toLowerCase();
    r = r.replace(new RegExp(/\s/g),"");
    r = r.replace(new RegExp(/[àáâãäå]/g),"a");
    r = r.replace(new RegExp(/æ/g),"ae");
    r = r.replace(new RegExp(/ç/g),"c");
    r = r.replace(new RegExp(/[èéêë]/g),"e");
    r = r.replace(new RegExp(/[ìíîï]/g),"i");
    r = r.replace(new RegExp(/ñ/g),"n");                
    r = r.replace(new RegExp(/[òóôõö]/g),"o");
    r = r.replace(new RegExp(/œ/g),"oe");
    r = r.replace(new RegExp(/[ùúûü]/g),"u");
    r = r.replace(new RegExp(/[ýÿ]/g),"y");
    r = r.replace(new RegExp(/\W/g),"");
    return r;
};

但是IE6让我很烦,它好像不喜欢我的正则表达式。


当前回答

这里有一个非常简单的解决方案,没有太多的代码,使用一个非常简单的变音符的映射,其中包括一些或全部映射到包含多个字符的ascii等价物,即Æ => AE, ffi => ffi,等等…还包括一些非常基本的功能测试

var diacriticsMap = {
  '\u00C0': 'A',  // À => A
  '\u00C1': 'A',   // Á => A
  '\u00C2': 'A',   // Â => A
  '\u00C3': 'A',   // Ã => A
  '\u00C4': 'A',   // Ä => A
  '\u00C5': 'A',   // Å => A
  '\u00C6': 'AE', // Æ => AE
  '\u00C7': 'C',   // Ç => C
  '\u00C8': 'E',   // È => E
  '\u00C9': 'E',   // É => E
  '\u00CA': 'E',   // Ê => E
  '\u00CB': 'E',   // Ë => E
  '\u00CC': 'I',   // Ì => I
  '\u00CD': 'I',   // Í => I
  '\u00CE': 'I',   // Î => I
  '\u00CF': 'I',   // Ï => I
  '\u0132': 'IJ', // IJ => IJ
  '\u00D0': 'D',   // Ð => D
  '\u00D1': 'N',   // Ñ => N
  '\u00D2': 'O',   // Ò => O
  '\u00D3': 'O',   // Ó => O
  '\u00D4': 'O',   // Ô => O
  '\u00D5': 'O',   // Õ => O
  '\u00D6': 'O',   // Ö => O
  '\u00D8': 'O',   // Ø => O
  '\u0152': 'OE', // Œ => OE
  '\u00DE': 'TH', // Þ => TH
  '\u00D9': 'U',   // Ù => U
  '\u00DA': 'U',   // Ú => U
  '\u00DB': 'U',   // Û => U
  '\u00DC': 'U',   // Ü => U
  '\u00DD': 'Y',   // Ý => Y
  '\u0178': 'Y',   // Ÿ => Y
  '\u00E0': 'a',   // à => a
  '\u00E1': 'a',   // á => a
  '\u00E2': 'a',   // â => a
  '\u00E3': 'a',   // ã => a
  '\u00E4': 'a',   // ä => a
  '\u00E5': 'a',   // å => a
  '\u00E6': 'ae', // æ => ae
  '\u00E7': 'c',   // ç => c
  '\u00E8': 'e',   // è => e
  '\u00E9': 'e',   // é => e
  '\u00EA': 'e',   // ê => e
  '\u00EB': 'e',   // ë => e
  '\u00EC': 'i',   // ì => i
  '\u00ED': 'i',   // í => i
  '\u00EE': 'i',   // î => i
  '\u00EF': 'i',   // ï => i
  '\u0133': 'ij', // ij => ij
  '\u00F0': 'd',   // ð => d
  '\u00F1': 'n',   // ñ => n
  '\u00F2': 'o',   // ò => o
  '\u00F3': 'o',   // ó => o
  '\u00F4': 'o',   // ô => o
  '\u00F5': 'o',   // õ => o
  '\u00F6': 'o',   // ö => o
  '\u00F8': 'o',   // ø => o
  '\u0153': 'oe', // œ => oe
  '\u00DF': 'ss', // ß => ss
  '\u00FE': 'th', // þ => th
  '\u00F9': 'u',   // ù => u
  '\u00FA': 'u',   // ú => u
  '\u00FB': 'u',   // û => u
  '\u00FC': 'u',   // ü => u
  '\u00FD': 'y',   // ý => y
  '\u00FF': 'y',   // ÿ => y
  '\uFB00': 'ff', // ff => ff
  '\uFB01': 'fi',   // fi => fi
  '\uFB02': 'fl', // fl => fl
  '\uFB03': 'ffi',  // ffi => ffi
  '\uFB04': 'ffl',  // ffl => ffl
  '\uFB05': 'ft', // ſt => ft
  '\uFB06': 'st'  // st => st
};

function replaceDiacritics(str) {
  var returnStr = '';
  if(str) {
    for (var i = 0; i < str.length; i++) {
      if (diacriticsMap[str[i]]) {
        returnStr += diacriticsMap[str[i]];
      } else {
        returnStr += str[i];
      }
    }
  }
  return returnStr;
}

function testStripDiacritics(input, expected) {
  var coChar = replaceDiacritics(input);
  console.log('The character passed in was ' + input);
  console.log('The character that came out was ' + coChar);
  console.log('The character expected was' + expected);
}

testStripDiacritics('À','A');
testStripDiacritics('A','A');
testStripDiacritics('Æ','AE');
testStripDiacritics('AE','AE');
testStripDiacritics('ÇhÀrlËšYŸZŽ','ChArlEsYYZZ');

其他回答

我的看法是,去掉所有口音和特殊字符。我混合了Lewis Diamond优雅的重音删除方法和社区wiki的快速字符替换余下的字符。结果是一个Ascii字符串。

function convertToAscii(string) {
    const unicodeToAsciiMap = {'Ⱥ':'A','Æ':'AE','Ꜻ':'AV','Ɓ':'B','Ƀ':'B','Ƃ':'B','Ƈ':'C','Ȼ':'C','Ɗ':'D','Dz':'D','Dž':'D','Đ':'D','Ƌ':'D','DŽ':'DZ','Ɇ':'E','Ꝫ':'ET','Ƒ':'F','Ɠ':'G','Ǥ':'G','Ⱨ':'H','Ħ':'H','Ɨ':'I','Ꝺ':'D','Ꝼ':'F','Ᵹ':'G','Ꞃ':'R','Ꞅ':'S','Ꞇ':'T','Ꝭ':'IS','Ɉ':'J','Ⱪ':'K','Ꝃ':'K','Ƙ':'K','Ꝁ':'K','Ꝅ':'K','Ƚ':'L','Ⱡ':'L','Ꝉ':'L','Ŀ':'L','Ɫ':'L','Lj':'L','Ł':'L','Ɱ':'M','Ɲ':'N','Ƞ':'N','Nj':'N','Ꝋ':'O','Ꝍ':'O','Ɵ':'O','Ø':'O','Ƣ':'OI','Ɛ':'E','Ɔ':'O','Ȣ':'OU','Ꝓ':'P','Ƥ':'P','Ꝕ':'P','Ᵽ':'P','Ꝑ':'P','Ꝙ':'Q','Ꝗ':'Q','Ɍ':'R','Ɽ':'R','Ꜿ':'C','Ǝ':'E','Ⱦ':'T','Ƭ':'T','Ʈ':'T','Ŧ':'T','Ɐ':'A','Ꞁ':'L','Ɯ':'M','Ʌ':'V','Ꝟ':'V','Ʋ':'V','Ⱳ':'W','Ƴ':'Y','Ỿ':'Y','Ɏ':'Y','Ⱬ':'Z','Ȥ':'Z','Ƶ':'Z','Œ':'OE','ᴀ':'A','ᴁ':'AE','ʙ':'B','ᴃ':'B','ᴄ':'C','ᴅ':'D','ᴇ':'E','ꜰ':'F','ɢ':'G','ʛ':'G','ʜ':'H','ɪ':'I','ʁ':'R','ᴊ':'J','ᴋ':'K','ʟ':'L','ᴌ':'L','ᴍ':'M','ɴ':'N','ᴏ':'O','ɶ':'OE','ᴐ':'O','ᴕ':'OU','ᴘ':'P','ʀ':'R','ᴎ':'N','ᴙ':'R','ꜱ':'S','ᴛ':'T','ⱻ':'E','ᴚ':'R','ᴜ':'U','ᴠ':'V','ᴡ':'W','ʏ':'Y','ᴢ':'Z','ᶏ':'a','ẚ':'a','ⱥ':'a','æ':'ae','ꜻ':'av','ɓ':'b','ᵬ':'b','ᶀ':'b','ƀ':'b','ƃ':'b','ɵ':'o','ɕ':'c','ƈ':'c','ȼ':'c','ȡ':'d','ɗ':'d','ᶑ':'d','ᵭ':'d','ᶁ':'d','đ':'d','ɖ':'d','ƌ':'d','ı':'i','ȷ':'j','ɟ':'j','ʄ':'j','dž':'dz','ⱸ':'e','ᶒ':'e','ɇ':'e','ꝫ':'et','ƒ':'f','ᵮ':'f','ᶂ':'f','ɠ':'g','ᶃ':'g','ǥ':'g','ⱨ':'h','ɦ':'h','ħ':'h','ƕ':'hv','ᶖ':'i','ɨ':'i','ꝺ':'d','ꝼ':'f','ᵹ':'g','ꞃ':'r','ꞅ':'s','ꞇ':'t','ꝭ':'is','ʝ':'j','ɉ':'j','ⱪ':'k','ꝃ':'k','ƙ':'k','ᶄ':'k','ꝁ':'k','ꝅ':'k','ƚ':'l','ɬ':'l','ȴ':'l','ⱡ':'l','ꝉ':'l','ŀ':'l','ɫ':'l','ᶅ':'l','ɭ':'l','ł':'l','ſ':'s','ẜ':'s','ẝ':'s','ɱ':'m','ᵯ':'m','ᶆ':'m','ȵ':'n','ɲ':'n','ƞ':'n','ᵰ':'n','ᶇ':'n','ɳ':'n','ꝋ':'o','ꝍ':'o','ⱺ':'o','ø':'o','ƣ':'oi','ɛ':'e','ᶓ':'e','ɔ':'o','ᶗ':'o','ȣ':'ou','ꝓ':'p','ƥ':'p','ᵱ':'p','ᶈ':'p','ꝕ':'p','ᵽ':'p','ꝑ':'p','ꝙ':'q','ʠ':'q','ɋ':'q','ꝗ':'q','ɾ':'r','ᵳ':'r','ɼ':'r','ᵲ':'r','ᶉ':'r','ɍ':'r','ɽ':'r','ↄ':'c','ꜿ':'c','ɘ':'e','ɿ':'r','ʂ':'s','ᵴ':'s','ᶊ':'s','ȿ':'s','ɡ':'g','ᴑ':'o','ᴓ':'o','ᴝ':'u','ȶ':'t','ⱦ':'t','ƭ':'t','ᵵ':'t','ƫ':'t','ʈ':'t','ŧ':'t','ᵺ':'th','ɐ':'a','ᴂ':'ae','ǝ':'e','ᵷ':'g','ɥ':'h','ʮ':'h','ʯ':'h','ᴉ':'i','ʞ':'k','ꞁ':'l','ɯ':'m','ɰ':'m','ᴔ':'oe','ɹ':'r','ɻ':'r','ɺ':'r','ⱹ':'r','ʇ':'t','ʌ':'v','ʍ':'w','ʎ':'y','ᶙ':'u','ᵫ':'ue','ꝸ':'um','ⱴ':'v','ꝟ':'v','ʋ':'v','ᶌ':'v','ⱱ':'v','ⱳ':'w','ᶍ':'x','ƴ':'y','ỿ':'y','ɏ':'y','ʑ':'z','ⱬ':'z','ȥ':'z','ᵶ':'z','ᶎ':'z','ʐ':'z','ƶ':'z','ɀ':'z','œ':'oe','ₓ':'x'};
    const stringWithoutAccents = string.normalize("NFD").replace(/[\u0300-\u036f]/g, '');
    return stringWithoutAccents.replace(/[^\u0000-\u007E]/g, character => unicodeToAsciiMap[character] || '');
}

新RegExp的格式为

RegExp(something, 'modifiers');

所以你会想

accentsTidy = function(s){
                        var r=s.toLowerCase();
                        r = r.replace(new RegExp("\\s", 'g'),"");
                        r = r.replace(new RegExp("[àáâãäå]", 'g'),"a");
                        r = r.replace(new RegExp("æ", 'g'),"ae");
                        r = r.replace(new RegExp("ç", 'g'),"c");
                        r = r.replace(new RegExp("[èéêë]", 'g'),"e");
                        r = r.replace(new RegExp("[ìíîï]", 'g'),"i");
                        r = r.replace(new RegExp("ñ", 'g'),"n");                            
                        r = r.replace(new RegExp("[òóôõö]", 'g'),"o");
                        r = r.replace(new RegExp("œ", 'g'),"oe");
                        r = r.replace(new RegExp("[ùúûü]", 'g'),"u");
                        r = r.replace(new RegExp("[ýÿ]", 'g'),"y");
                        r = r.replace(new RegExp("\\W", 'g'),"");
                        return r;
                };

你可以用多种方式创建正则表达式。使用新的regexp -构造函数:

var re = new RegExp("[a-z]", "ig") //(string pattern, string modifiers)

或者使用正则表达式文字表示法:

var re = /[a-z]/ig; // /pattern/modifiers

你把两者混在一起了。

有很多这样的方法,但我认为这个方法简单且足够好:

 function remove_accents(strAccents) {
    var strAccents = strAccents.split('');
    var strAccentsOut = new Array();
    var strAccentsLen = strAccents.length;
    var accents =    "ÀÁÂÃÄÅàáâãäåÒÓÔÕÕÖØòóôõöøÈÉÊËèéêëÇçðÐÌÍÎÏìíîïÙÚÛÜùúûüÑñŠšŸÿýŽž";
    var accentsOut = "AAAAAAaaaaaaOOOOOOOooooooEEEEeeeeCcdDIIIIiiiiUUUUuuuuNnSsYyyZz";
    for (var y = 0; y < strAccentsLen; y++) {
        if (accents.indexOf(strAccents[y]) != -1) {
            strAccentsOut[y] = accentsOut.substr(accents.indexOf(strAccents[y]), 1);
        } else
            strAccentsOut[y] = strAccents[y];
    }
    strAccentsOut = strAccentsOut.join('');

    return strAccentsOut;
}

如果你还想删除特殊字符,并转换下划线中的空格和连字符,请执行以下操作:

string = remove_accents(string);
string = string.replace(/[^a-z0-9\s]/gi, '').replace(/[-\s]/g, '_');

谢谢大家 我使用这个版本并说明原因(因为我在开始时错过了这些解释,所以我试图帮助下一个读者,如果他和我一样无聊…)

备注:我想要一个高效的解决方案,所以:

只有一次regexp编译(如果需要) 每个字符串只扫描一个字符串 查找已翻译字符的有效方法 等等……

我的版本是: (里面没有新的技术技巧,只有一些精选的+解释)

makeSortString = (function() {
    var translate_re = /[¹²³áàâãäåaaaÀÁÂÃÄÅAAAÆccç©CCÇÐÐèéê?ëeeeeeÈÊË?EEEEE€gGiìíîïìiiiÌÍÎÏ?ÌIIIlLnnñNNÑòóôõöoooøÒÓÔÕÖOOOØŒr®Ršs?ߊS?ùúûüuuuuÙÚÛÜUUUUýÿÝŸžzzŽZZ]/g;
    var translate = {
"¹":"1","²":"2","³":"3","á":"a","à":"a","â":"a","ã":"a","ä":"a","å":"a","a":"a","a":"a","a":"a","À":"a","Á":"a","Â":"a","Ã":"a","Ä":"a","Å":"a","A":"a","A":"a",
"A":"a","Æ":"a","c":"c","c":"c","ç":"c","©":"c","C":"c","C":"c","Ç":"c","Ð":"d","Ð":"d","è":"e","é":"e","ê":"e","?":"e","ë":"e","e":"e","e":"e","e":"e","e":"e",
"e":"e","È":"e","Ê":"e","Ë":"e","?":"e","E":"e","E":"e","E":"e","E":"e","E":"e","€":"e","g":"g","G":"g","i":"i","ì":"i","í":"i","î":"i","ï":"i","ì":"i","i":"i",
"i":"i","i":"i","Ì":"i","Í":"i","Î":"i","Ï":"i","?":"i","Ì":"i","I":"i","I":"i","I":"i","l":"l","L":"l","n":"n","n":"n","ñ":"n","N":"n","N":"n","Ñ":"n","ò":"o",
"ó":"o","ô":"o","õ":"o","ö":"o","o":"o","o":"o","o":"o","ø":"o","Ò":"o","Ó":"o","Ô":"o","Õ":"o","Ö":"o","O":"o","O":"o","O":"o","Ø":"o","Œ":"o","r":"r","®":"r",
"R":"r","š":"s","s":"s","?":"s","ß":"s","Š":"s","S":"s","?":"s","ù":"u","ú":"u","û":"u","ü":"u","u":"u","u":"u","u":"u","u":"u","Ù":"u","Ú":"u","Û":"u","Ü":"u",
"U":"u","U":"u","U":"u","U":"u","ý":"y","ÿ":"y","Ý":"y","Ÿ":"y","ž":"z","z":"z","z":"z","Ž":"z","Z":"z","Z":"z"
    };
    return function(s) {
        return(s.replace(translate_re, function(match){return translate[match];}) );
    }
})();

我是这样用的:

var without_accents = makeSortString("wïthêüÄTrèsBïgüeAk100t");
// I let you guess the result,
// no I was kidding you : I give you the result : witheuatresbigueak100t

评论:

Tthe instruction inside it is done once (after, makeSortString != undefined) function(){...} is stored once in makeSortString, so the "big" translate_re and translate objects are stored once When you call makeSortString('something') it call directly the inside function which calls only s.replace(...) : it is efficient s.replace uses regexp (the special syntax of var translate_re= .... is in fact equivalent to var translate_re = new RegExp("[¹....Z]","g"); but the compilation of the regexp is done once for all, and the scan of the s String is done one for a call of the function (not for every character as it would be in a loop) For each character found s.replace calls function(match) where parameter match contains the character found, and it call the corresponding translated character (translate[match]) Translate[match] is probably efficient too as the javascript translate object is probably implemented by javascript with a hashtab or something equivalent and allow the program to find the translated character almost directly and not for instance through a loop on a array of all characters to find the right one (which would be awfully unefficient).