如何从字符串中删除所有非字母的字符?

非字母数字呢?

这必须是一个自定义函数还是也有更通用的解决方案?


当前回答

信不信由你,在我的系统中,这个丑陋的函数比 G Mastros 的优雅函数表现得更好。

-- Returns @s with every character removed except the ASCII digits 0-9 and
-- the ASCII letters A-Z / a-z.
--
-- Parameters:
--   @s  input text (up to 256 characters).
-- Returns:
--   The filtered text, or NULL when @s is NULL or when no character of @s
--   survives the filter.
CREATE FUNCTION dbo.RemoveSpecialChar (@s VARCHAR(256)) 
RETURNS VARCHAR(256) 
WITH SCHEMABINDING
AS
    BEGIN
        IF @s IS NULL
            RETURN NULL

        DECLARE @s2 VARCHAR(256) = '',
                @l INT = LEN(@s),   -- LEN ignores trailing spaces; they would be dropped anyway
                @p INT = 1,
                @c INT

        -- Walk the input one character at a time and keep only the
        -- code points for 0-9 (48-57), A-Z (65-90) and a-z (97-122).
        WHILE @p <= @l
            BEGIN
                SET @c = ASCII(SUBSTRING(@s, @p, 1))
                IF @c BETWEEN 48 AND 57
                   OR  @c BETWEEN 65 AND 90
                   OR  @c BETWEEN 97 AND 122
                    SET @s2 = @s2 + CHAR(@c)
                SET @p = @p + 1
            END

        -- An input made up only of filtered characters yields NULL, not ''.
        IF LEN(@s2) = 0
            RETURN NULL

        RETURN @s2
    END

其他回答

-- Upper-cases @str, folds a set of accented-letter codes onto the plain
-- letters A, E, I, O, U, C and N, drops every character whose code is on the
-- removal list below, and copies all other characters through unchanged.
--
-- Parameters:
--   @str  text to clean.
-- Returns:
--   The cleaned, upper-cased text. A NULL input yields '' because the loop
--   condition is never true for a NULL length.
--
-- NOTE(review): several listed codes are above 255; ASCII() on a VARCHAR
-- character presumably never returns them. They are kept exactly as they
-- were in the original lists.
CREATE FUNCTION remove_spc_char(@str VARCHAR(MAX))
  RETURNS VARCHAR(MAX) 
AS
BEGIN
  DECLARE @upper_text VARCHAR(MAX) = UPPER(@str);
  DECLARE @result     VARCHAR(MAX) = '';
  DECLARE @pos        INTEGER = 1;
  DECLARE @code       INTEGER;

  WHILE @pos <= LEN(@upper_text)
  BEGIN
    SET @code = ASCII(SUBSTRING(@upper_text, @pos, 1));

    -- Normalise the code: accented variants collapse onto the base letter,
    -- codes on the removal list become NULL, anything else is left alone.
    SET @code =
      CASE
        WHEN @code IN (192,193,194,195,196,65) THEN 65   -- A
        WHEN @code IN (200,201,202,203,233,69) THEN 69   -- E
        WHEN @code IN (204,205,206,207,296,73) THEN 73   -- I
        WHEN @code IN (210,211,212,213,214,79) THEN 79   -- O
        WHEN @code IN (217,218,219,220,85)     THEN 85   -- U
        WHEN @code IN (199,231,67)             THEN 67   -- C
        WHEN @code IN (209,78)                 THEN 78   -- N
        WHEN @code IN (924,181,358,216,222,330,272,208,198,42,37,38,34,36,35,
                       64,33,39,41,40,43,61,95,45,62,60,63,47,176,183,124,166,174,359,248,254,
                       180,170,186,126,312,331,273,172,178,179,163,162,123,91,93,125,92,167,240,
                       223,230,171,187,169,185,168)
                                               THEN NULL
        ELSE @code
      END;

    -- CHAR(NULL) is NULL and concat() treats NULL as an empty string, so a
    -- removed character simply contributes nothing to the result.
    SET @result = concat(@result, CHAR(@code));

    SET @pos = @pos + 1;
  END;

  RETURN @result;
END;

如果您像我一样,不能仅向生产数据添加函数,但仍然想执行这种过滤,那么这里有一个纯SQL解决方案,使用PIVOT表将过滤后的部分重新组合在一起。

注意:我把 PIVOT 的列硬编码为最多 40 个字符;如果要过滤更长的字符串,就必须添加更多的列。

-- Function-free scrub: explode each string into one row per character,
-- keep only the characters that pass the filter, then PIVOT the survivors
-- back into columns [1]..[40] and glue them together.
--
-- CONCAT() treats the NULL pivot cells (positions that were filtered out or
-- lie beyond the end of the string) as empty strings, so the deprecated
-- session-wide SET CONCAT_NULL_YIELDS_NULL OFF is no longer needed.
--
-- Limit: only the first 40 character positions are reassembled; add more
-- pivot columns for longer strings.
WITH
    -- Sample input; replace with the real table/column to scrub.
    ToBeScrubbed AS (
        SELECT
            1 AS id,
            '*SOME 222@ !@* #* BOGUS !@*&! DATA' AS ColumnToScrub
    ),

    -- One row per kept character, tagged with its original position.
    Scrubbed AS (
        SELECT
            t.id,
            P.number AS ValueOrder,
            SUBSTRING(t.ColumnToScrub, P.number, 1) AS ScrubbedValue
        FROM
            ToBeScrubbed AS t
            -- spt_values rows with type 'P' serve as a ready-made numbers table.
            INNER JOIN master..spt_values AS P
                ON P.number BETWEEN 1 AND LEN(t.ColumnToScrub)
                AND P.type = 'P'
        WHERE
            -- Keep the character only if it is inside [a-z].
            -- NOTE(review): whether upper-case letters match depends on the
            -- collation in effect - confirm it is case-insensitive.
            PATINDEX('%[^a-z]%', SUBSTRING(t.ColumnToScrub, P.number, 1)) = 0
    )

SELECT
    pvt.id,
    CONCAT(
        [1],  [2],  [3],  [4],  [5],  [6],  [7],  [8],  [9],  [10],
        [11], [12], [13], [14], [15], [16], [17], [18], [19], [20],
        [21], [22], [23], [24], [25], [26], [27], [28], [29], [30],
        [31], [32], [33], [34], [35], [36], [37], [38], [39], [40]
    ) AS ScrubbedData
FROM (
    SELECT
        id,
        ValueOrder,
        ScrubbedValue
    FROM
        Scrubbed
    ) AS src
    PIVOT (
        MAX(ScrubbedValue) FOR ValueOrder IN (
        [1], [2], [3], [4], [5], [6], [7], [8], [9], [10],
        [11], [12], [13], [14], [15], [16], [17], [18], [19], [20],
        [21], [22], [23], [24], [25], [26], [27], [28], [29], [30],
        [31], [32], [33], [34], [35], [36], [37], [38], [39], [40]
        )
    ) AS pvt;

使用CTE生成的数字表逐个检查每个字符,然后用FOR XML把保留下来的字符连接成一个字符串,您可以…

-- Removes from @expression every single character that matches @pattern.
--
-- Parameters:
--   @pattern     a LIKE pattern that is compared against ONE character at a
--                time, e.g. '[^a-z]' to strip everything but letters.
--   @expression  the text to filter.
-- Returns:
--   The remaining characters in their original order, or NULL when nothing
--   remains (including NULL input).
--
-- Notes:
--   * LEN() ignores trailing spaces, so trailing spaces of @expression are
--     never examined and do not appear in the result.
--   * NOTE(review): with the TYPE directive, kept characters that are not
--     legal in XML (most control characters) will presumably raise an error
--     instead of being emitted as numeric entities - confirm if such input
--     is possible.
CREATE FUNCTION [dbo].[PatRemove](
    @pattern varchar(50),
    @expression varchar(8000) 
    )
RETURNS varchar(8000)
AS
BEGIN
    WITH 
        -- Ten digits cross-joined four times give 10,000 rows, enough to
        -- index every position of a varchar(8000).
        d(d) AS (SELECT d FROM (VALUES (0),(1),(2),(3),(4),(5),(6),(7),(8),(9)) digits(d)),
        nums(n) AS (
            SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL))
            FROM d d1 CROSS JOIN d d2 CROSS JOIN d d3 CROSS JOIN d d4
        ),
        -- One row per character, carrying its position so order can be restored.
        chars(n, c) AS (SELECT n, SUBSTRING(@expression, n, 1) FROM nums WHERE n <= LEN(@expression))
    SELECT 
        @expression = (
            SELECT c AS [text()]
            FROM chars
            WHERE c NOT LIKE @pattern
            -- Without an explicit ORDER BY the concatenation order is not guaranteed.
            ORDER BY n
            -- TYPE + .value() returns the raw text; plain FOR XML PATH('')
            -- would turn kept '<', '>' and '&' into '&lt;', '&gt;', '&amp;'.
            FOR XML PATH(''), TYPE
        ).value('.', 'varchar(8000)');

    RETURN @expression;
END

SQL Server >= 2017…

-- Strips a fixed list of unwanted characters from @text and collapses the
-- resulting whitespace to single spaces. Uses TRANSLATE and TRIM, which the
-- surrounding text states are available from SQL Server 2017.
declare @text varchar(max)

-- create some sample text
select
@text=
'
Lorem @ipsum  *&dolor-= sit?! amet, {consectetur } adipiscing\ elit. Vivamus commodo justo metus, sed facilisis ante 
congue eget. Proin ac bibendum sem/.
'

-- the characters to be removed
declare @unwanted varchar(max)='''.,!?/<>"[]{}|`~@#$%^&*()-+=/\:;'+char(13)+char(10)

-- interim replaced with
declare @replace_with char(1)=' '

-- call the translate function that will change unwanted characters to spaces
-- in this sample (TRANSLATE needs both lists to have the same length, hence
-- the REPLICATE to len(@unwanted))
declare @translated varchar(max)
select @translated=TRANSLATE(@text,@unwanted,REPLICATE(@replace_with,len(@unwanted)))

-- In this case, I want to preserve one space between words.
-- Collapse every run of spaces to a single space with nested REPLACEs rather
-- than STRING_SPLIT + STRING_AGG: STRING_SPLIT does not guarantee the order
-- of its output rows, so re-aggregating them could scramble the words.
--   ' '  -> '<>'   every space becomes a marker pair
--   '><' -> ''     adjacent pairs cancel, leaving one '<>' per run
--   '<>' -> ' '    the surviving marker becomes a single space
-- '<' and '>' are safe markers because both are in @unwanted and therefore
-- cannot occur in @translated.
select  trim(replace(replace(replace(@translated,' ','<>'),'><',''),'<>',' '))

-- Result
-- 'Lorem ipsum dolor sit amet consectetur adipiscing elit Vivamus commodo justo metus sed facilisis ante congue eget Proin ac bibendum sem'

Here's a solution that doesn't require creating a function or listing every character to replace. It uses a recursive WITH statement (a recursive CTE) in combination with PATINDEX to find unwanted characters. Each level of recursion removes all occurrences of one bad character, so by default it can handle up to 100 unique bad characters in any given string (e.g. "ABC123DEF234" contains 4 unique bad characters: 1, 2, 3 and 4). The limit of 100 is SQL Server's default maximum recursion depth for a WITH statement; it can be changed with OPTION (MAXRECURSION n). It does not limit the number of rows that can be processed, which is bounded only by the available memory. If you don't want DISTINCT results, you can remove the two DISTINCT keywords from the code.

-- Sample rows to run the query against.
SELECT src.TXT
INTO #testData
FROM (VALUES ('ABC DEF,K.l(p)'),('123H,J,234'),('ABCD EFG')) AS src(TXT)

-- The pattern describes what counts as a "bad" character:
--   '%[^A-Z]%'     strips everything that is not a letter
--   '%[^A-Z0-9]%'  strips everything that is not a letter or a digit
DECLARE @BadCharacterPattern VARCHAR(250) = '%[^A-Z]%';

-- Each recursion level finds the first bad character still present in a
-- string and removes every occurrence of it; a string is finished once
-- PATINDEX no longer finds anything (BadCharIndex = 0).
WITH recurMain AS (
    -- Anchor: the distinct source strings and the position of their first bad character.
    SELECT DISTINCT
        CAST(TXT AS VARCHAR(250)) AS TXT,
        PATINDEX(@BadCharacterPattern, TXT) AS BadCharIndex
    FROM #testData

    UNION ALL

    -- Recursive step: re-scan the string produced by the previous removal.
    SELECT
        CAST(stripped.TXT AS VARCHAR(250)) AS TXT,
        PATINDEX(@BadCharacterPattern, stripped.TXT) AS BadCharIndex
    FROM (
        -- Only rows that still contain a bad character are carried forward,
        -- so the character at BadCharIndex always exists here.
        SELECT
            REPLACE(prev.TXT, SUBSTRING(prev.TXT, prev.BadCharIndex, 1), '') AS TXT
        FROM recurMain AS prev
        WHERE prev.BadCharIndex > 0
    ) AS stripped
)
-- Only fully cleaned strings are reported.
SELECT DISTINCT TXT
FROM recurMain
WHERE BadCharIndex = 0;