我想匹配的只是一个URL的根,而不是一个文本字符串的整个URL。考虑到:

http://www.youtube.com/watch?v=ClkQA2Lb_iE
http://youtu.be/ClkQA2Lb_iE
http://www.example.com/12xy45
http://example.com/random

我想让最后2个实例解析到www.example.com或example.com域。

我听说正则表达式很慢,这将是我在页面上的第二个正则表达式,所以如果有办法做到没有正则表达式,请告诉我。

我正在寻找这个解决方案的JS/jQuery版本。


当前回答

不需要解析字符串,只需将URL作为参数传递给URL构造函数:

const url = 'http://www.youtube.com/watch?v=ClkQA2Lb_iE';
const { hostname } = new URL(url);

console.assert(hostname === 'www.youtube.com');

其他回答

function hostname(url) {
    var match = url.match(/:\/\/(www[0-9]?\.)?(.[^/:]+)/i);
    if ( match != null && match.length > 2 && typeof match[2] === 'string' && match[2].length > 0 ) return match[2];
}

上面的代码将成功解析以下示例url的主机名: http://WWW.first.com/folder/page.html first.com http://mail.google.com/folder/page.html mail.google.com https://mail.google.com/folder/page.html mail.google.com http://www2.somewhere.com/folder/page.html?q=1 somewhere.com https://www.another.eu/folder/page.html?q=1 another.eu

原文出处:http://www.primaryobjects.com/CMS/Article145

试试这个:

var matches = url.match(/^https?\:\/\/([^\/?#]+)(?:[\/?#]|$)/i);
var domain = matches && matches[1];  // domain will be null if no match is found

如果你想从结果中排除端口,请使用下面的表达式:

/^https?\:\/\/([^\/:?#]+)(?:[\/:?#]|$)/i

编辑:要防止特定域匹配,请使用反向前向。(? ! youtube.com)

/^https?\:\/\/(?!(?:www\.)?(?:youtube\.com|youtu\.be))([^\/:?#]+)(?:[\/:?#]|$)/i
String.prototype.trim = function(){return his.replace(/^\s+|\s+$/g,"");}
function getHost(url){
    if("undefined"==typeof(url)||null==url) return "";
    url = url.trim(); if(""==url) return "";
    var _host,_arr;
    if(-1<url.indexOf("://")){
        _arr = url.split('://');
        if(-1<_arr[0].indexOf("/")||-1<_arr[0].indexOf(".")||-1<_arr[0].indexOf("\?")||-1<_arr[0].indexOf("\&")){
            _arr[0] = _arr[0].trim();
            if(0==_arr[0].indexOf("//")) _host = _arr[0].split("//")[1].split("/")[0].trim().split("\?")[0].split("\&")[0];
            else return "";
        }
        else{
            _arr[1] = _arr[1].trim();
            _host = _arr[1].split("/")[0].trim().split("\?")[0].split("\&")[0];
        }
    }
    else{
        if(0==url.indexOf("//")) _host = url.split("//")[1].split("/")[0].trim().split("\?")[0].split("\&")[0];
        else return "";
    }
    return _host;
}
function getHostname(url){
    if("undefined"==typeof(url)||null==url) return "";
    url = url.trim(); if(""==url) return "";
    return getHost(url).split(':')[0];
}
function getDomain(url){
    if("undefined"==typeof(url)||null==url) return "";
    url = url.trim(); if(""==url) return "";
    return getHostname(url).replace(/([a-zA-Z0-9]+.)/,"");
}

代码:

var regex = /\w+.(com|co\.kr|be)/ig;
var urls = ['http://www.youtube.com/watch?v=ClkQA2Lb_iE',
            'http://youtu.be/ClkQA2Lb_iE',
            'http://www.example.com/12xy45',
            'http://example.com/random'];


$.each(urls, function(index, url) {
    var convertedUrl = url.match(regex);
    console.log(convertedUrl);
});

结果:

youtube.com
youtu.be
example.com
example.com

解析域——一个非常可靠的轻量级库

NPM安装解析域

const { fromUrl, parseDomain } = require("parse-domain");

示例1

parseDomain(fromUrl("http://www.example.com/12xy45"))
{ type: 'LISTED',
  hostname: 'www.example.com',
  labels: [ 'www', 'example', 'com' ],
  icann:
   { subDomains: [ 'www' ],
     domain: 'example',
     topLevelDomains: [ 'com' ] },
  subDomains: [ 'www' ],
  domain: 'example',
  topLevelDomains: [ 'com' ] }

示例2

parseDomain(fromUrl("http://subsub.sub.test.ExAmPlE.coM/12xy45"))
{ type: 'LISTED',
  hostname: 'subsub.sub.test.example.com',
  labels: [ 'subsub', 'sub', 'test', 'example', 'com' ],
  icann:
   { subDomains: [ 'subsub', 'sub', 'test' ],
     domain: 'example',
     topLevelDomains: [ 'com' ] },
  subDomains: [ 'subsub', 'sub', 'test' ],
  domain: 'example',
  topLevelDomains: [ 'com' ] }

Why?

Depending on the use case and volume I strongly recommend against solving this problem yourself using regex or other string manipulation means. The core of this problem is that you need to know all the gtld and cctld suffixes to properly parse url strings into domain and subdomains, these suffixes are regularly updated. This is a solved problem and not one you want to solve yourself (unless you are google or something). Unless you need the hostname or domain name in a pinch don't try and parse your way out of this one.