代码之家  ›  专栏  ›  技术社区  ›  Dmitry Poroh

erlang二进制到小写性能

  •  0
  • Dmitry Poroh  · 技术社区  · 6 年前

    我的目标是加速将纯ascii二进制转换为小写的性能。我不需要英语以外的任何语言。我写了一些变体并进行了比较:

    二进制推导式(binary comprehension):

    %% Builds a new binary from Binary, one byte at a time: bytes in the
    %% range $A..$Z are shifted to the matching lowercase letter, every
    %% other byte is copied unchanged.
    binary_comprehension(Binary) ->
        << <<(case Byte of
                  Upper when Upper >= $A, Upper =< $Z -> Upper + ($a - $A);
                  Other -> Other
              end)>>
           || <<Byte>> <= Binary >>.
    

    列表推导式(list comprehension):

    %% Converts Binary to a list of bytes, lowercases the bytes in the
    %% range $A..$Z with a list comprehension, and converts the result
    %% back to a binary.
    list_comprehension(Binary) ->
        list_to_binary(
            [case Char of
                 Upper when Upper >= $A, Upper =< $Z -> Upper - $A + $a;
                 _ -> Char
             end || Char <- binary_to_list(Binary)]).
    

    以及常规的 string:lowercase。

    令人惊讶的是,列表推导式的版本胜过了其他所有方案:

    1> timer:tc(fun() -> lists:foreach(fun(_) -> tolower:list_comprehension(<<"QWEQWEIQEKQHWKEHKQWHEKQHWKEQWEKHQWLKL">>) end, L100000) end).
    {267603,ok}
    
    2> timer:tc(fun() -> lists:foreach(fun(_) -> tolower:binary_comprehension(<<"QWEQWEIQEKQHWKEHKQWHEKQHWKEQWEKHQWLKL">>) end, L100000) end).
    {324383,ok}
    
    3> timer:tc(fun() -> lists:foreach(fun(_) -> string:lowercase(<<"QWEQWEIQEKQHWKEHKQWHEKQHWKEQWEKHQWLKL">>) end, L100000) end).
    {319819,ok}
    

    你知道为什么“两次列表转换 + 列表推导式”比直接对二进制做转换快得多吗?

    也许你知道更强大的优化?

    更新:

    我还发现,对字符列表(而不是二进制)调用 string:lowercase 的版本也很快:

    %% Lowercases Binary by converting it to a list of bytes, passing that
    %% list to string:lowercase/1, and converting the result back to a binary.
    %% NOTE(review): the timing run shown after this snippet calls
    %% tolower:string_to_lower/1 -- confirm which name the module exports.
    string_lowercase(Binary) ->
        list_to_binary(string:lowercase(binary_to_list(Binary))).
    

    运行:

    39> timer:tc(fun() -> lists:foreach(fun(_) -> tolower:string_to_lower(<<"QWEQWEIQEKQHWKEHKQWHEKQHWKEQWEKHQWLKL">>) end, L100000) end).
    {277766,ok}
    
    0 回复  |  直到 6 年前
        1
  •  0
  •   Alexei K    6 年前

    我对代码做了一些修改并修改了测试用例。测试更改不是强制性的,但我个人更喜欢这样:

    -module(tolower).
    -compile(export_all).
    
    %% Maps a character code in the range $A..$Z to its lowercase
    %% counterpart (the codes differ by $a - $A = 32); any other value
    %% is returned unchanged.
    u2l(Char) when $A =< Char, Char =< $Z -> Char + ($a - $A);
    u2l(Char) -> Char.
    
    %% Produces a new binary by applying u2l/1 to every byte of Binary
    %% inside a binary comprehension.
    binary_comprehension(Binary) ->
      << <<(u2l(Byte)):8>> || <<Byte:8>> <= Binary >>.
    
    %% Converts Binary to a list of bytes, applies u2l/1 to each element
    %% with a list comprehension, and converts the result back to a binary.
    list_comprehension(Binary) ->
      Chars = binary_to_list(Binary),
      Lowered = [u2l(Char) || Char <- Chars],
      list_to_binary(Lowered).
    
    %% Lowercases the bytes of Binary with an explicit tail-recursive loop.
    %% Note: the result is a list of character codes, not a binary -- the
    %% accumulator is only reversed, never passed to list_to_binary/1.
    list_recur(Binary) ->
      Chars = binary_to_list(Binary),
      list_recur(Chars, []).

    %% Worker loop: walks the input list, pushing each (possibly lowercased)
    %% character onto the accumulator, and reverses the accumulator once the
    %% input is exhausted.
    list_recur([Char | Rest], Acc) when Char >= $A, Char =< $Z ->
      Lower = Char + 32,
      list_recur(Rest, [Lower | Acc]);
    list_recur([Char | Rest], Acc) ->
      list_recur(Rest, [Char | Acc]);
    list_recur([], Acc) ->
      lists:reverse(Acc).
    
    %% Lowercases Binary by converting it to a list of bytes, passing that
    %% list to string:lowercase/1, and converting the result back to a binary.
    string_to_lower(Binary) ->
      Chars = binary_to_list(Binary),
      Lowered = string:lowercase(Chars),
      list_to_binary(Lowered).
    
    %% Benchmark driver. Times, in this order:
    %%   1. 100000 calls of binary_comprehension/1 on the short sample binary;
    %%   2-4. one call each of binary_comprehension/1, list_comprehension/1
    %%        and list_recur/1 on the sample repeated 100000 times;
    %%   5. string:lowercase/1 applied directly to that large binary;
    %%   6. string_to_lower/1 on the same large binary.
    %% Prints the elapsed times reported by timer:tc and returns the result
    %% of io:format/2.
    test() ->
      Sample = <<"QWEQWEIQEKQHWKEHKQWHEKQHWKEQWEKHQWLKL">>,
      Big = binary:copy(Sample, 100000),
      Rounds = lists:seq(1, 100000),
      %% Many small conversions: call the function once per element of Rounds.
      ManySmall = fun() ->
                    lists:foreach(fun(_) -> tolower:binary_comprehension(Sample) end, Rounds)
                  end,
      {SmallTime, _} = timer:tc(ManySmall),
      %% One large conversion per candidate; the comprehension keeps the
      %% measurements in the listed order.
      Candidates = [{tolower, binary_comprehension},
                    {tolower, list_comprehension},
                    {tolower, list_recur},
                    {string, lowercase},
                    {tolower, string_to_lower}],
      BigTimes = [element(1, timer:tc(Mod, Fun, [Big])) || {Mod, Fun} <- Candidates],
      io:format("~n1.binary_comprehension = ~10w~n2.binary_comprehension = ~10w~n3.  list_comprehension = ~10w~n4.          list_recur = ~10w~n5.           lowercase = ~10w~n6.     string_to_lower = ~10w~n",
      [SmallTime | BigTimes]).
    

    Erlang shell 中的结果表明,由于系统的并发特性,各次运行的耗时并不一致。但正如预期的那样,耗时最短的是二进制推导式。

    62> c(tolower).    
    tolower.erl:2: Warning: export_all flag enabled - all functions will be exported
    {ok,tolower}
    63> l(tolower).    
    {module,tolower}
    64> tolower:test().
    
    1.binary_comprehension =     109000
    2.binary_comprehension =      94000
    3.  list_comprehension =     312001
    4.          list_recur =     344001
    5.           lowercase =     469002
    6.     string_to_lower =     218000
    ok
    65> tolower:test().
    
    1.binary_comprehension =     140998
    2.binary_comprehension =      93999
    3.  list_comprehension =     327994
    4.          list_recur =     296996
    5.           lowercase =     155997
    6.     string_to_lower =     280996
    ok
    66> tolower:test().
    
    1.binary_comprehension =     124998
    2.binary_comprehension =      93998
    3.  list_comprehension =     327995
    4.          list_recur =     296995
    5.           lowercase =     452993
    6.     string_to_lower =     202997
    ok
    67> tolower:test().
    
    1.binary_comprehension =     125000
    2.binary_comprehension =      94000
    3.  list_comprehension =     312000
    4.          list_recur =     282000
    5.           lowercase =     171000
    6.     string_to_lower =     266000
    ok
    

    第5行与第6行的耗时不同,是因为用二进制参数调用 string:lowercase/1 时,输入会被当作 utf8 序列来处理;而用字符串(字符列表)参数调用 string:lowercase/1 时,则可以避免这种 utf8 处理。有关详细信息,请参见 OTP 中的 string.erl 代码。