From: Branimir Maksimovic on
On Fri, 12 Mar 2010 08:14:30 +0100
Branimir Maksimovic <bmaxa(a)> wrote:

I had a wrong count when sorting frequencies. It didn't show up
because when there is only one element in a row it wasn't an issue.
I used SSE2 to reduce pressure on the stack, but there is no
noticeable difference. It could be faster with SSE4, but then it
won't work on anything below Penryn ;)


Bug fix first:

; frequencies1 - walk the hash table slot by slot and hand every stored
; entry to the `find` macro against sortedtable.
;
; NOTE(review): this copy of the macro lost text in transit: the macro
; braces, the definitions of the local labels l1/l2/l3/e1, and the memory
; operands of both `dword[]` expressions are missing. Restore them from the
; original source before assembling; the comments below only describe what
; the surviving instructions do.
;
; Clobbers: ebx, ecx, flags (plus whatever `find` clobbers).
macro frequencies1
local l1,l2,l3,e1
; ecx = number of elements still to be visited
mov ecx,dword[hashtable.elements]
cmp ecx,0
; empty table: nothing to do
jz e1
; NOTE(review): operand missing - presumably the address of the first table slot
mov ebx,dword[]
; a slot holding 0 has no bucket attached; branch to l3 for that case
cmp dword[ebx],0
jz l3
; save slot pointer, then remaining count (so [esp] is the remaining count)
push ebx ecx
; ebx = bucket pointer stored in the slot
mov ebx,dword[ebx]
; ecx = first dword of the bucket (presumably its entry count - confirm against strfind)
mov ecx,dword[ebx]

; disabled debug output
if 0
ccall printf,fmt6,ecx,dword[ebx+4]
end if

; step over the leading dword to the first 16-byte entry
add ebx,4
; bug fix: subtract this bucket's entry count from the saved remaining
; count; previously buckets holding more than one entry were miscounted
sub dword[esp],ecx
; keep entry pointer and per-bucket counter across the `find` expansion
push ebx ecx
; NOTE(review): first argument lost its operand in this copy
find dword[],sortedtable.elements,8,ebx
pop ecx ebx
; advance to the next 16-byte entry of this bucket
add ebx,16
dec ecx
; more entries in this bucket (l2 is presumably the per-entry loop head)
jnz l2
; restore remaining count (already reduced above) and the slot pointer
pop ecx ebx
; only test the count here - it was already decremented on the stack
and ecx,ecx
; all elements visited
jz e1
; next 4-byte slot
add ebx,4
jmp l1

SSE2 version:

; hash str,size - fold the bytes at `str` into dwords and accumulate those
; dwords in xmm1.
;
; In:  str  = address of the input bytes
;      size = byte count (loaded into ecx)
; Out: xmm1 = accumulated hash dwords, newest dword in the low lane
; Clobbers: eax, ebx, ecx, edx, esi, edi, xmm1, xmm2, flags
;
; NOTE(review): the definitions of the local labels l1/l2/l3 and the macro
; braces are missing from this copy, so the exact loop structure cannot be
; read off the text; restore them from the original source.
; NOTE(review): xmm1 is shifted before it is ever written here - presumably
; the caller is expected to clear it or the stale upper lanes are ignored;
; confirm.
macro hash str,size
local l1,l2,l3
; ecx = bytes remaining, ebx = read pointer
mov ecx,size
mov ebx,str
; eax = dword being assembled
xor eax,eax
; esi = bytes folded into one dword (16), edi = number of dwords to produce (2)
mov esi,16
mov edi,2
; make room for the next byte: 2 bits per input byte
; (assumes every input byte is in the range 0..3 - TODO confirm)
shl eax,2
movzx edx,byte [ebx]
or eax,edx
inc ebx
dec esi
jnz l2
; dword complete: shift xmm1 left by one dword and insert eax into the low
; lane. movd zeroes the upper lanes, so the shifted copy kept in xmm2 is
; ORed back in.
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
; start a fresh dword
xor eax,eax
mov esi,16
dec edi
dec ecx
jnz l1
; input ended: flush the partially filled dword the same way
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1, eax
por xmm1,xmm2
dec edi
jnz l3

; hashfind data,elements,block,srchstr,srchlen - hash a key and look it up
; (or insert it) in the table at `data`.
;
; In:  data     = hash table base (loaded into ebx for strfind)
;      elements = element counter, passed through to strfind
;      block    = passed through to strfind
;      srchstr  = key address; kept in xmm0 so strfind can store it in a new entry
;      srchlen  = key length handed to `hash`
; Out: whatever strfind leaves behind (it computes an entry address into eax)
; Clobbers: everything `hash` and `strfind` clobber, plus xmm0.
;
; NOTE(review): macro braces are missing from this copy.
macro hashfind data,elements,block,srchstr,srchlen
; xmm0 = key pointer, parked in an SSE register instead of on the stack
mov eax,srchstr
movd xmm0,eax
; xmm1 = hash of the key
hash srchstr,srchlen
; ebx = table base expected by strfind
mov ebx,data
strfind elements,block

; strfind elements,block - find the entry whose stored 64-bit hash equals
; the low 64 bits of xmm1, creating or growing the bucket when needed.
;
; In:  ebx  = hash table base (array of dword bucket pointers)
;      xmm0 = key pointer (low dword), stored into a newly created entry
;      xmm1 = hash of the key
;      elements = memory dword incremented once per inserted entry
; Bucket layout as used below: dword entry count, followed by 16-byte
; entries { dword cleared on insert, dword key pointer, qword hash }.
; Clobbers: eax, ebx, esi, xmm2, xmm3, xmm4, flags, plus the caller-saved
;           registers of the realloc call.
;
; NOTE(review): the label definitions (l1..l5, s1, e1) and the macro braces
; are missing from this copy, `e2` is used but not listed in `local`, and
; the `block` parameter is not referenced by any surviving line. The macro
; also appears to end abruptly. Restore from the original source.
macro strfind elements,block
local l1,l2,l3,l4,l5,s1,e1
; eax = second dword of the hash
movdqa xmm2,xmm1
psrldq xmm2,4 ; pextrd (SSE4.1) would be faster but won't work on AMD
movd eax,xmm2
; esi = 0: current bucket size in bytes for the "no bucket yet" case
xor esi,esi
; keep 17 bits as the slot index
and eax,0x1ffff ; enlarging the hash table gained approx. 1 sec
; slot index -> byte offset (slots are dword pointers)
shl eax,2
; park slot offset and table base in SSE registers so they survive the
; realloc call without using the stack
movd xmm2,eax
movd xmm3,ebx
; slot already has a bucket: go search it
cmp dword[ebx+eax],0
jne l3
; allocate
; take the `sema` lock: if sema == 0 it becomes 1; eax receives the old value
mov ebx,1
xor eax,eax
lock cmpxchg dword[sema],ebx ; test and set
and eax,eax
; lock was already held (s1 is presumably the retry/spin path)
jnz s1
; new size = current entries (esi bytes) + 4-byte count + one 16-byte entry
add esi,20
movd eax,xmm2
movd ebx,xmm3
ccall realloc,dword[ebx+eax],esi ; realloc is not thread safe
; release the lock
lock and dword[sema],0 ; reset
; esi = new bucket pointer; zero means realloc failed
mov esi,eax
and esi,esi
jz e2
movd eax,xmm2
movd ebx,xmm3
; cmp sets the flags from the OLD slot value; mov does not touch flags, so
; the jne below still tests whether a bucket existed before this realloc
cmp dword[ebx+eax],0
mov dword[ebx+eax],esi
jne l2
; brand new bucket: start with an entry count of 0
mov esi, dword[ebx+eax]
mov dword[esi],0
; ebx = first entry of the bucket, eax = byte offset of the first free entry
mov ebx,dword[ebx+eax]
add ebx,4
mov eax,dword[ebx-4]
imul eax,16
; initialise the new entry: clear first dword, store key pointer and hash
mov dword[ebx+eax],0
movd dword[ebx+eax+4],xmm0
movq [ebx+eax+8],xmm1
; one more element overall, one more entry in this bucket
inc dword[elements]
inc dword[ebx-4]
jmp e1
; search path: ebx = first entry of the existing bucket, eax = scan offset
mov ebx,dword[ebx+eax]
add ebx,4
xor eax,eax

; esi = size of the occupied part of the bucket in bytes
mov esi,dword[ebx-4]
imul esi,16
cmp eax,esi
jge l1 ; we need to reallocate
; compare the stored 64-bit hash with xmm1, one dword lane at a time
movq xmm4,[ebx+eax+8]
pcmpeqd xmm4,xmm1 ; with SSE4.1 this could be done faster using pcmpeqq

; low dword lane differs: not this entry
movd esi,xmm4
and esi,esi
jz l5
; second dword lane equal as well: match found
psrldq xmm4,4
movd esi,xmm4
and esi,esi
jnz e1

; advance to the next 16-byte entry and keep scanning
add eax,16
jmp l4
; eax = address of the entry at offset eax (presumably the macro's result)
lea eax,[ebx+eax]


Sometimes online sometimes not