什么是一个好的完整正则表达式或其他一些过程,将采取标题:

如何将标题更改为URL的一部分,如堆栈溢出?

然后把它变成

how-do-you-change-a-title-to-be-part-of-the-url-like-stack-overflow

在堆栈溢出的seo友好的url中使用?

我使用的开发环境是Ruby on Rails,但是如果有一些其他特定于平台的解决方案(。NET, PHP, Django),我也很想看到这些。

我相信我(或其他读者)在不同的平台上也会遇到同样的问题。

我使用自定义路由,我主要想知道如何改变字符串的所有特殊字符被删除,它都是小写的,所有空白被替换。


当前回答

你也可以使用这个JavaScript函数在表单中生成鼻涕虫(这个是基于/从Django复制的):

function makeSlug(urlString, filter) {
    // Changes, e.g., "Petty theft" to "petty_theft".
    // Remove all these words from the string before URLifying

    if(filter) {
        removelist = ["a", "an", "as", "at", "before", "but", "by", "for", "from",
        "is", "in", "into", "like", "of", "off", "on", "onto", "per",
        "since", "than", "the", "this", "that", "to", "up", "via", "het", "de", "een", "en",
        "with"];
    }
    else {
        removelist = [];
    }
    s = urlString;
    r = new RegExp('\\b(' + removelist.join('|') + ')\\b', 'gi');
    s = s.replace(r, '');
    s = s.replace(/[^-\w\s]/g, ''); // Remove unneeded characters
    s = s.replace(/^\s+|\s+$/g, ''); // Trim leading/trailing spaces
    s = s.replace(/[-\s]+/g, '-'); // Convert spaces to hyphens
    s = s.toLowerCase(); // Convert to lowercase
    return s; // Trim to first num_chars characters
}

其他回答

我们是这样做的。注意,可能有比你第一眼意识到的更多的边缘条件。

这是第二个版本,展开后的性能提高了5倍(是的,我对它进行了基准测试)。我认为我应该优化它,因为这个函数可以在每页被调用数百次。

/// <summary>
/// Produces optional, URL-friendly version of a title, "like-this-one". 
/// hand-tuned for speed, reflects performance refactoring contributed
/// by John Gietzen (user otac0n) 
/// </summary>
public static string URLFriendly(string title)
{
    if (title == null) return "";

    const int maxlen = 80;
    int len = title.Length;
    bool prevdash = false;
    var sb = new StringBuilder(len);
    char c;

    for (int i = 0; i < len; i++)
    {
        c = title[i];
        if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'))
        {
            sb.Append(c);
            prevdash = false;
        }
        else if (c >= 'A' && c <= 'Z')
        {
            // tricky way to convert to lowercase
            sb.Append((char)(c | 32));
            prevdash = false;
        }
        else if (c == ' ' || c == ',' || c == '.' || c == '/' || 
            c == '\\' || c == '-' || c == '_' || c == '=')
        {
            if (!prevdash && sb.Length > 0)
            {
                sb.Append('-');
                prevdash = true;
            }
        }
        else if ((int)c >= 128)
        {
            int prevlen = sb.Length;
            sb.Append(RemapInternationalCharToAscii(c));
            if (prevlen != sb.Length) prevdash = false;
        }
        if (i == maxlen) break;
    }

    if (prevdash)
        return sb.ToString().Substring(0, sb.Length - 1);
    else
        return sb.ToString();
}

要查看被替换的代码的前一个版本(但在功能上与之相当,而且快了5倍),请查看这篇文章的修订历史(单击日期链接)。

另外,RemapInternationalCharToAscii方法的源代码可以在这里找到。

我知道这是一个非常老的问题,但由于大多数浏览器现在都支持unicode url,我在XRegex中找到了一个很好的解决方案,它可以转换除字母以外的所有内容(在所有语言中都是'-')。

这可以在几种编程语言中实现。

模式是\\p{^L}+,然后你只需要用它来替换所有非字母到'-'。

node.js中xregex模块的工作示例。

var text = 'This ! can @ have # several $ letters % from different languages such as עברית or Español';

var slugRegEx = XRegExp('((?!\\d)\\p{^L})+', 'g');

var slug = XRegExp.replace(text, slugRegEx, '-').toLowerCase();

console.log(slug) ==> "this-can-have-several-letters-from-different-languages-such-as-עברית-or-español"

我将代码移植到TypeScript中。它可以很容易地适应JavaScript。

我添加了一个.contains方法到字符串原型,如果你的目标是最新的浏览器或ES6,你可以使用.includes代替。

if (!String.prototype.contains) {
    String.prototype.contains = function (check) {
        return this.indexOf(check, 0) !== -1;
    };
}

declare interface String {
    contains(check: string): boolean;
}

export function MakeUrlFriendly(title: string) {
            if (title == null || title == '')
                return '';

            const maxlen = 80;
            let len = title.length;
            let prevdash = false;
            let result = '';
            let c: string;
            let cc: number;
            let remapInternationalCharToAscii = function (c: string) {
                let s = c.toLowerCase();
                if ("àåáâäãåą".contains(s)) {
                    return "a";
                }
                else if ("èéêëę".contains(s)) {
                    return "e";
                }
                else if ("ìíîïı".contains(s)) {
                    return "i";
                }
                else if ("òóôõöøőð".contains(s)) {
                    return "o";
                }
                else if ("ùúûüŭů".contains(s)) {
                    return "u";
                }
                else if ("çćčĉ".contains(s)) {
                    return "c";
                }
                else if ("żźž".contains(s)) {
                    return "z";
                }
                else if ("śşšŝ".contains(s)) {
                    return "s";
                }
                else if ("ñń".contains(s)) {
                    return "n";
                }
                else if ("ýÿ".contains(s)) {
                    return "y";
                }
                else if ("ğĝ".contains(s)) {
                    return "g";
                }
                else if (c == 'ř') {
                    return "r";
                }
                else if (c == 'ł') {
                    return "l";
                }
                else if (c == 'đ') {
                    return "d";
                }
                else if (c == 'ß') {
                    return "ss";
                }
                else if (c == 'Þ') {
                    return "th";
                }
                else if (c == 'ĥ') {
                    return "h";
                }
                else if (c == 'ĵ') {
                    return "j";
                }
                else {
                    return "";
                }
            };

            for (let i = 0; i < len; i++) {
                c = title[i];
                cc = c.charCodeAt(0);

                if ((cc >= 97 /* a */ && cc <= 122 /* z */) || (cc >= 48 /* 0 */ && cc <= 57 /* 9 */)) {
                    result += c;
                    prevdash = false;
                }
                else if ((cc >= 65 && cc <= 90 /* A - Z */)) {
                    result += c.toLowerCase();
                    prevdash = false;
                }
                else if (c == ' ' || c == ',' || c == '.' || c == '/' || c == '\\' || c == '-' || c == '_' || c == '=') {
                    if (!prevdash && result.length > 0) {
                        result += '-';
                        prevdash = true;
                    }
                }
                else if (cc >= 128) {
                    let prevlen = result.length;
                    result += remapInternationalCharToAscii(c);
                    if (prevlen != result.length) prevdash = false;
                }
                if (i == maxlen) break;
            }

            if (prevdash)
                return result.substring(0, result.length - 1);
            else
                return result;
        }

下面是Jeff代码的我的版本。我做了以下修改:

The hyphens were appended in such a way that one could be added, and then need removing as it was the last character in the string. That is, we never want “my-slug-”. This means an extra string allocation to remove it on this edge case. I’ve worked around this by delay-hyphening. If you compare my code to Jeff’s the logic for this is easy to follow. His approach is purely lookup based and missed a lot of characters I found in examples while researching on Stack Overflow. To counter this, I first peform a normalisation pass (AKA collation mentioned in Meta Stack Overflow question Non US-ASCII characters dropped from full (profile) URL), and then ignore any characters outside the acceptable ranges. This works most of the time... ... For when it doesn’t I’ve also had to add a lookup table. As mentioned above, some characters don’t map to a low ASCII value when normalised. Rather than drop these I’ve got a manual list of exceptions that is doubtless full of holes, but it is better than nothing. The normalisation code was inspired by Jon Hanna’s great post in Stack Overflow question How can I remove accents on a string?. The case conversion is now also optional. public static class Slug { public static string Create(bool toLower, params string[] values) { return Create(toLower, String.Join("-", values)); } /// <summary> /// Creates a slug. /// References: /// http://www.unicode.org/reports/tr15/tr15-34.html /// https://meta.stackexchange.com/questions/7435/non-us-ascii-characters-dropped-from-full-profile-url/7696#7696 /// https://stackoverflow.com/questions/25259/how-do-you-include-a-webpage-title-as-part-of-a-webpage-url/25486#25486 /// https://stackoverflow.com/questions/3769457/how-can-i-remove-accents-on-a-string /// </summary> /// <param name="toLower"></param> /// <param name="normalised"></param> /// <returns></returns> public static string Create(bool toLower, string value) { if (value == null) return ""; var normalised = value.Normalize(NormalizationForm.FormKD); const int maxlen = 80; int len = normalised.Length; bool prevDash = false; var sb = new StringBuilder(len); char c; for (int i = 0; i < len; i++) { c = normalised[i]; if ((c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { if (prevDash) { sb.Append('-'); prevDash = false; } sb.Append(c); } else if (c >= 'A' && c <= 'Z') { if (prevDash) { sb.Append('-'); prevDash = false; } // Tricky way to convert to lowercase if (toLower) sb.Append((char)(c | 32)); else sb.Append(c); } else if (c == ' ' || c == ',' || c == '.' || c == '/' || c == '\\' || c == '-' || c == '_' || c == '=') { if (!prevDash && sb.Length > 0) { prevDash = true; } } else { string swap = ConvertEdgeCases(c, toLower); if (swap != null) { if (prevDash) { sb.Append('-'); prevDash = false; } sb.Append(swap); } } if (sb.Length == maxlen) break; } return sb.ToString(); } static string ConvertEdgeCases(char c, bool toLower) { string swap = null; switch (c) { case 'ı': swap = "i"; break; case 'ł': swap = "l"; break; case 'Ł': swap = toLower ? "l" : "L"; break; case 'đ': swap = "d"; break; case 'ß': swap = "ss"; break; case 'ø': swap = "o"; break; case 'Þ': swap = "th"; break; } return swap; } }

关于更多的细节,单元测试,以及为什么Facebook的URL方案比堆栈溢出更聪明的解释,我在我的博客上有一个扩展版本。

您可以使用以下帮助器方法。它可以转换Unicode字符。

public static string ConvertTextToSlug(string s)
{
    StringBuilder sb = new StringBuilder();

    bool wasHyphen = true;

    foreach (char c in s)
    {
        if (char.IsLetterOrDigit(c))
        {
            sb.Append(char.ToLower(c));
            wasHyphen = false;
        }
        else
            if (char.IsWhiteSpace(c) && !wasHyphen)
            {
                sb.Append('-');
                wasHyphen = true;
            }
    }

    // Avoid trailing hyphens
    if (wasHyphen && sb.Length > 0)
        sb.Length--;

    return sb.ToString().Replace("--","-");
}