代码之家  ›  专栏  ›  技术社区  ›  Johan

为什么德尔福会在一个不知名的地方插入nop?

  •  8
  • Johan  · 技术社区  · 11 年前

    以下代码:

    while Assigned(p) do begin
      np:= p^.next;
      h:= leaf_hash(p^.data);   <<-- inline routine
      h:= h mod nhashprime;
      p^.next:= nhashtab[h]; 
      nhashtab[h]:= p;
      p:= np;
    end; { while }
    

    生成以下部件:

    hlife.pas.605: h:= leaf_hash(p^.data);
    00000000005D4602 498B4018         mov rax,[r8+$18]
    00000000005D4606 48C1E830         shr rax,$30
    00000000005D460A 498B5018         mov rdx,[r8+$18]
    00000000005D460E 48C1EA20         shr rdx,$20
    00000000005D4612 81E2FFFF0000     and edx,$0000ffff
    00000000005D4618 4D8B5818         mov r11,[r8+$18]
    00000000005D461C 49C1EB10         shr r11,$10
    00000000005D4620 4181E3FFFF0000   and r11d,$0000ffff
    00000000005D4627 418B7018         mov esi,[r8+$18]
    00000000005D462B 81E6FFFF0000     and esi,$0000ffff
    00000000005D4631 488D34F6         lea rsi,[rsi+rsi*8]
    00000000005D4635 4403DE           add r11d,esi
    00000000005D4638 4F8D1CDB         lea r11,[r11+r11*8]
    00000000005D463C 4103D3           add edx,r11d
    00000000005D463F 488D14D2         lea rdx,[rdx+rdx*8]
    00000000005D4643 03C2             add eax,edx
    hlife.pas.606: h:= h mod nhashprime;
    00000000005D4645 8BC0             mov eax,eax   <<--- Why is there a NOP here?
    00000000005D4647 4C63DB           movsxd r11,rbx
    00000000005D464A 4899             cwd
    00000000005D464C 49F7FB           idiv r11
    00000000005D464F 488BC2           mov rax,rdx
    hlife.pas.607: p^.next:= nhashtab[h];
    00000000005D4652 488B5538         mov rdx,[rbp+$38]
    

    Delphi在 nhashtab[h]:= p; 线 如果 leaf_hash 如果函数是一个正常函数,那么它就有意义了。
    (不,并不是因为RET仍将返回到[5D4645]执行nop)
    但现在这不是一个跳跃目标。

    所以我(只是)很好奇,它为什么会这样做?

    [编辑]:SSCCE
    好的,我有一个SSCCE,它不是很短,但必须要做。

    注意编译器设置(调试+Win64)

    enter image description here

    unit Unit16;
    
    interface
    
    uses
      Winapi.Windows, Winapi.Messages, System.SysUtils, System.Variants, System.Classes, Vcl.Graphics,
      Vcl.Controls, Vcl.Forms, Vcl.Dialogs, Vcl.StdCtrls;
    
    type
      pnode = ^node;
      tflavour = (tnode, tleaf, tleaf64);
    
      node = record
        case flavour: tflavour of
          tnode: (next: pnode; (* hash link *)
              nw, ne, sw, se: pnode; (* constant; nw not= 0 means nonleaf *)
              res: pnode); (* cache *)
          tleaf: (next1: pnode; (* hash link *)
              isnode: pnode; (* must always be zero for leaves *)
              nw1, ne1, sw1, se1: word; (* constant *)
              res1, res2: word; (* constant *)
            );
          tleaf64: (next2: pnode; (* hash link *)
              isnode1: pnode; (* must always be zero for leaves *)
              data: Uint64; (* constant *)
              res1a, res2a: word; (* constant *)
            )
      end;
    
      ppnode = array of pnode;
    
      THashBox = class(TPersistent)
      strict private
        leafhashpop: integer;
        leafhashlimit: integer;
        leafhashprime: integer;
        leafhashtab: ppnode;
        nodehashpop: integer;
        nodehashlimit: integer;
        nodehashprime: integer;
        nodehashtab: ppnode;
      private
        TotalTime, Occurrences: Uint64;
        StartTime, EndTime: Uint64;
        procedure resize_leaves();
      public
        constructor Create;
      end;
    
      TForm16 = class(TForm)
        Button1: TButton;
        procedure Button1Click(Sender: TObject);
      private
        HashBox: THashBox;
      public
      end;
    
    var
      Form16: TForm16;
    
    implementation
    
    {$R *.dfm}
    
    const
      maxmem = 2000*1000*1000;   {2GB}
    
    var
      alloced: cardinal;
    
    function rdtsc: int64; assembler;
    asm
      { xor eax,eax;
      push rbx
      cpuid
      pop rbx }
      rdtsc
    end;
    
    function node_hash(n: pnode): cardinal; { inline; } assembler; overload;
    // var
    // a: pnativeint;
      // begin
      // Result:= nativeint(n^.se) + 3 * (nativeint(n^.sw) + 3 * (nativeint(n^.ne) + 3 * nativeint(n^.nw) + 3));
    asm
      mov eax,[rcx+node.nw]
      lea eax,[eax+eax*2+3]
      add eax,[rcx+node.ne]
      lea eax,[eax+eax*2]
      add eax,[rcx+node.sw]
      lea eax,[eax+eax*2]
      add eax,[rcx+node.se]
    end;
    
    function leaf_hash(a, b, c, d: cardinal): cardinal; inline; overload;
    begin
      Result:= (d + 9 * (c + 9 * (b + 9 * a)))
    end;
    
    function leaf_hash(data: Uint64): cardinal; inline; overload;
    begin
      // Result:= d + 9 * (c + 9 * (b + 9 * a));
      Result:= ((data shr 48) + 9 * (((data shr 32) and $FFFF) + 9 * (((data shr 16) and $FFFF) + 9 * (data and $FFFF))));
      Inc(Result);
    end;
    
    procedure TForm16.Button1Click(Sender: TObject);
    begin
      HashBox:= THashBox.Create;
      Hashbox.resize_leaves;
    end;
    
    function nextprime(old: integer): integer;
    begin
      Result:= 1009;
    end;
    
    constructor THashBox.Create;
    begin
      leafhashprime:= 7;
      SetLength(leafhashtab, leafhashprime);
    end;
    
    procedure THashBox.resize_leaves();
    var
      i, i1, i2: integer;
      nhashprime: Cardinal;
      p: pnode;
      nhashtab: ppnode;
      np: pnode;
      h: Integer;
      n, n2: integer;
      diff1, diff2: integer;
    begin
      nhashprime:= nextprime(4 * leafhashprime);
      if (nhashprime * sizeof(pnode) > maxmem - alloced) then begin
        leafhashlimit:= 2000 * 1000 * 1000;
        exit;
      end;
      (*
        *   Don't let the hash table buckets take more than 4% of the
        *   memory.  If we're starting to strain memory, let the buckets
        *   fill up a bit more.
      *)
      if (nhashprime > maxmem div 100) then begin
        nhashprime:= nextprime(maxmem div 100);
        if (nhashprime = leafhashprime) then begin
          leafhashlimit:= 2000 * 1000 * 1000;
          exit;
        end;
      end;
      SetLength(nhashtab, nhashprime); //make a new table, do not resize the existing one.
      alloced:= alloced + sizeof(pnode) * (nhashprime - leafhashprime);
    
      diff1:= maxint;
      for i1:= 0 to 100 do begin
        n:= 0;
        StartTime:= rdtsc;
        for i:= 0 to leafhashprime - 1 do begin
          p:= leafhashtab[i];
          if Assigned(p) then begin
            h:= node_hash(p);
            h:= h mod nhashprime;
            inc(n, h);
          end;
        end;
        EndTime:= rdtsc;
        if ((EndTime - StartTime) < diff1) then diff1:= (EndTime - StartTime);
    
      end;
    
      diff2:= maxint;
      for i1:= 0 to 100 do begin
        n2:= 0;
        StartTime:= rdtsc;
        for i:= 0 to leafhashprime - 1 do begin
          p:= leafhashtab[i];
          if Assigned(p) then begin
            inc(n2);
          end;
        end;
        EndTime:= rdtsc;
        if (endtime - starttime) < diff2 then diff2:= endtime - starttime;
      end;
    
      TotalTime:= diff1 - diff2;
      if n <> n2 then Occurrences:= nhashprime;
    
      for i:= 0 to leafhashprime - 1 do begin
        // for (p=hashtab[i]; p;) {
        p:= leafhashtab[i];
        while Assigned(p) do begin    <<--- put a breakpoint here
          np:= p^.next;
          h:= leaf_hash(p^.data);
          h:= h mod nhashprime;
          p^.next:= nhashtab[h];
          nhashtab[h]:= p;
          p:= np;
        end; { while }
      end; { for i }
      // free(hashtab);
      leafhashtab:= nhashtab;
      leafhashprime:= nhashprime;
      leafhashlimit:= leafhashprime;
    end;
    
    end.
    

    您将看到此拆解:

    Unit16.pas.196: h:= h mod nhashprime;
    000000000059CE4B 4863C0           movsxd rax,rax
    000000000059CE4E 448B5528         mov r10d,[rbp+$28]
    000000000059CE52 458BD2           mov r10d,r10d     <<--- weird NOP here 
    000000000059CE55 4899             cwd
    000000000059CE57 49F7FA           idiv r10
    000000000059CE5A 488BC2           mov rax,rdx
    Unit16.pas.197: p^.next:= nhashtab[h];
    000000000059CE5D 488B5538         mov rdx,[rbp+$38]
    
    3 回复  |  直到 11 年前
        1
  •  5
  •   Johan    11 年前

    答案是

    MOV EAX,EAX
    

    不是禁止手术

    在x64上,操作64位寄存器的低32位将使高32位为零。
    因此,上述说明实际上应理解为:

    MOVZX RAX,EAX 
    

    根据AMD的说法

    32位运算的结果隐式地零扩展到64位值。

        2
  •  4
  •   Arnaud Bouchez    11 年前

    IMHO这不是 nop 对于alignment,但在我看来,这听起来就像是未优化的生成代码,以及您自己的变量的错误签名。

    h:= h mod nhashprime;
    

    可分为:

    mov eax,eax       new h = eax, old h = eax  // which does not mean anything
    movsxd r11,rbx    convert with sign nhashprime stored in rbx into temp registry r11
    cwd               signed rax into rdx:rax
    idiv r11          signed divide rdx:rax by r11 -> rax=quotient, rdx=remainder
    mov rax,rdx       store remainder rdx into rax
    

    您是否尝试启用代码生成优化?我想它会修复的内容 mov eax,eax .

    但您的原始代码也经过了次优化。 在您的情况下,应该使用无符号算术。

    而且,你最好使用二次方 nhashprime ,计算一个简单的 and 二进制运算而不是慢速除法:

    var h, nhashprimeMask: cardinal; // use UNSIGNED arithmetic here!
    
    // here nhashprime is a POWER OF TWO (128,256,512,1024,2048...)
    nhashprimeMask := nhashprime-1; // compute the mask
    
    while Assigned(p) do begin
      np:= p^.next;
      h:= leaf_hash(p^.data) and nhashprimeMask;
      p^.next:= nhashtab[h]; 
      nhashtab[h]:= p;
      p:= np;
    end; { while }
    

    这段代码会更快,编译起来也会更好。

        3
  •  0
  •   Remy Lebeau    11 年前

    这是一种对对齐代码的优化,尤其是在循环中,以避免缓存行停滞等。