From: Branimir Maksimovic on
On Wed, 17 Mar 2010 07:49:16 +0100
Branimir Maksimovic <bmaxa(a)hotmail.com> wrote:

This is another finished version. Previous version used packed ACTG
string as hash but that wasn't enough, so added little bit twiddling
to get less collisions ;)
Improved sse2 compare function, used pmovmskb to employ single
compare instead of double compare. Could easily be extended
to support 64 chars nucleotides (then I have to align better
in order to get speed ;), but benchmark requires maximum
18, so support is for up to 32 now (because of that 1 sec ;)
First place c++ program uses trick to compress first
then process, but since benchmark site is not interested
to show assembler performance, I wont do that to get first
place ;)
C++ hash table is implemented as array of elements which
are linked list. Secret of better c++ hash table performance is that I
use array of pointers to arrays, while if there are no collisions
c++ is really fast. So in case there are
more collisions (c++ has fewer collisions, actually almost none)
my would be faster ;)

Note. C++ program, which I compare to, isn;t case insensitive
and I got it with assembler which performs type conversion,
not just store in memory!

Why Im doing this? To show that programming in assembler is
worth while , unlike popular belief ;)

-rw-r--r-- 1 bmaxa bmaxa 8811 2010-03-10 09:31 knucleotide.cpp
bmaxa(a)maxa:~/fasm/knucleotide$ ls -l knucleotidef.asm
-rw-r--r-- 1 bmaxa bmaxa 12649 2010-03-20 00:37 knucleotidef.asm
bmaxa(a)maxa:~/fasm/knucleotide$
Code Size : cpp program uses finished libraries while
asm one implements everything in this source (and I didn;t remove
commented code,some of it not used at all ;) )!

cpp programs use some non standard hash table which is
not present on older g++ implementations, while assembler
program requires only gcc to link object file ;)
so I couldn't run c++ program on servers, with older
g++ ;)

code size:
bmaxa(a)maxa:~/fasm/knucleotide$ ls -l knucleotide
-rwxr-xr-x 1 bmaxa bmaxa 13904 2010-03-20 00:57 knucleotide
bmaxa(a)maxa:~/fasm/knucleotide$ ls -l knucleotidecpp
-rwxr-xr-x 1 bmaxa bmaxa 158624 2010-03-20 01:18 knucleotidecpp

cpp executable is 10 times larger, even I used macros!

speed:
bmaxa(a)maxa:~/fasm/knucleotide$ time ./knucleotide <~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m10.390s
user 0m18.300s
sys 0m0.140s
bmaxa(a)maxa:~/fasm/knucleotide$ time ./knucleotidecpp <~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m10.220s
user 0m19.610s
sys 0m0.140s
bmaxa(a)maxa:~/fasm/knucleotide$

User time for asm is one second less even c++ is compiled for maximum
optimizations and does not performs to upper or lower on input file!

Source follows. I spent some time in doing this, it's not generic
enough but can easily be ;)

Greets!

struc vector d,s
{
.data dd d
.size dd s
.elements dd 0
}

macro ccall proc,[arg] ; call CDECL procedure
{
common
local size
size = 0
reverse
pushd arg
size = size+4
common
call proc
add esp,size
}

macro sys_exit rc
{
mov eax,1 ; exit
mov ebx,rc
int 0x80
}

macro sys_read fd, buf, size
{
mov eax, 3 ; sys_read
mov ebx, fd
mov ecx, buf
mov edx, size
int 0x80
}
macro sys_write fd, buf, size
{
mov eax, 4 ; sys_write
mov ebx, fd
mov ecx, buf
mov edx, size
int 0x80
}

CLONE_VM equ 0x00000100
CLONE_FS equ 0x00000200
CLONE_FILES equ 0x00000400
CLONE_SIGHAND equ 0x00000800
CLONE_THREAD equ 0x00010000

macro sys_clone stack
{
mov eax,120 ; sys_clone
mov ebx,CLONE_VM or CLONE_FS or CLONE_FILES \
or CLONE_SIGHAND;
mov ecx,stack ; choose stack
xor edx,edx ; no struct
int 0x80
}

__WNOTHREAD equ 0x20000000
__WALL equ 0x40000000
__WCLONE equ 0x80000000

macro sys_wait pid
{
mov ebx,pid
mov eax,114 ; sys_wait4
xor ecx,ecx
xor esi,esi
mov edx,__WALL;
int 0x80
}

macro read fd, buf,size
{
local l1,l2,l3
mov eax, dword [fptr]
and eax,eax
jnz l2
l1:
sys_read fd,filebuf,fsize
and eax,eax
jz l3
lea eax, [eax+filebuf]
mov dword [fend], eax
mov dword [fptr], filebuf
l2:
mov ecx, size
mov ebx, size
mov eax, dword [fend]
sub eax, dword [fptr]
jz l1
cmp eax,ecx
cmovl ecx,eax
mov eax,ecx
sub ebx, ecx
strncpy buf,dword [fptr], ecx, 0
and ebx,ebx
mov dword [fptr],esi
jnz l1
l3:
}

macro getLine fd, buf, size
{
local l1,l2
mov ecx, size
mov edi, buf
l1:
and ecx,ecx
jz l2
push ecx
push edi
read fd,dword[esp],1
pop edi
pop ecx
cmp eax,1
jne l2;
dec ecx
inc edi
cmp byte [edi-1], 0xa
jnz l1
dec edi
l2:
mov byte [edi],0
}

macro dwordnset s, c, count
{
mov edi,s
mov eax,c
mov ecx,count
cld
rep stosd
}

macro strnset s,c, size
{
mov edi,s
mov eax,c
mov ecx,size
cld
rep stosb
}

macro dwordncmp s1, s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx,size
repe cmpsd
if dir
cld
end if
}

macro strncmp s1, s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx,size
repe cmpsb
if dir
cld
end if
}

macro dwordncpy s1,s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx, size
rep movsd
if dir
cld
end if
}

macro strncpy s1,s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx, size
rep movsb
if dir
cld
end if
}

macro to_num src
{
mov al,src
xlatb
}

macro to_char src
{
mov al,src
xlatb
}

macro pack_str dst,src,size,f
{
if ~f
local l1
mov esi,src
mov edi,dst
mov ecx,size
mov edx,0
mov ebx,xtbl
l1:
to_num byte [esi]
mov byte [edi], al
inc edi
inc esi
inc edx
dec ecx
jnz l1
else
strncpy dst,src,size,0
end if
}

macro really_pack_str dst,src,size,f
{
local l1,l2,e1
mov esi,src
mov edi,dst
mov ecx,size
mov edx,1
l1:
mov ebx,4
mov byte [edi],0
l2:
push ebx
mov ebx,xtbl
to_num byte [esi]
pop ebx
shl byte [edi],2
or byte [edi], al
inc esi
dec ecx
jz e1
dec ebx
jnz l2
inc edi
inc edx ; count
jmp l1
e1:
}

macro unpack_str dst,src,size
{
local l1
mov esi,src
mov edi,dst
mov ecx,size
mov ebx,xtbl
l1:
to_char byte [esi]
mov byte [edi], al
inc edi
inc esi
dec ecx
jnz l1
}

macro initvector data,oldsize,size,block
{
local e1,e2
mov eax, size
imul eax, block
push eax
ccall realloc,dword[data],eax
pop ebx
and eax,eax
jz e1
mov dword[data],eax
mov dword[oldsize],ebx
jmp e2
e1:
ccall perror, err1
sys_exit -1
e2:
}

macro freevector data,size
{
local l1
mov ecx,size
mov ebx,data
l1:
push ebx ecx
ccall free,dword[ebx]
pop ecx ebx
mov dword[ebx],0
add ebx,4
dec ecx
jnz l1
}

macro hash str,size
{
local l1,l2,e1
mov ecx,size
mov ebx,str
xor eax,eax
mov esi,16
mov edi,4
pxor xmm1,xmm1
l1:
shl eax,2
movzx edx,byte [ebx]
or eax,edx
inc ebx
dec esi
jnz l2
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
xor eax,eax
mov esi,16
dec edi
jz e1
l2:
dec ecx
jnz l1
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
dec edi
e1:
}

macro hashfind data,elements,block,srchstr,srchlen
{
mov eax,srchstr
movd xmm0,eax
hash srchstr,srchlen
mov ebx,data
strfind elements,block
}

macro strfind elements,block
{
local l1,l2,l3,l4,l5,s1,s2,s3,e1
movdqa xmm2,xmm1
mov edx,4
sub edx,edi
xor eax,eax
s2:
movd ecx,xmm2
shld eax,ecx,16
and ecx,0xffff
add eax,ecx
psrldq xmm2,4
dec edx
jnz s2
s3:
and eax,0x1ffff
xor esi,esi
shl eax,2
movd xmm2,eax
movd xmm3,ebx
cmp dword[ebx+eax],0
jne l3
l1:
; allocate
s1:
mov ebx,1
xor eax,eax
lock cmpxchg dword[sema],ebx ; test and set
and eax,eax
jnz s1
add esi,20
movd eax,xmm2
movd ebx,xmm3
ccall realloc,dword[ebx+eax],esi ; realloc is not thread safe
lock and dword[sema],0 ; reset
mov esi,eax
and esi,esi
jz e2
movd eax,xmm2
movd ebx,xmm3
cmp dword[ebx+eax],0
mov dword[ebx+eax],esi
jne l2
mov esi, dword[ebx+eax]
mov dword[esi],0
l2:
mov ebx,dword[ebx+eax]
add ebx,4
mov eax,dword[ebx-4]
shl eax,4
mov dword[ebx+eax],0
movd [ebx+eax+4],xmm0
movq [ebx+eax+8],xmm1
inc dword[elements]
inc dword[ebx-4]
jmp e1
;search
l3:
mov ebx,dword[ebx+eax]
add ebx,4
xor eax,eax

l4:
mov esi,dword[ebx-4]
shl esi,4
cmp eax,esi
jge l1 ; we need to reallocate
movq xmm4,[ebx+eax+8]
pcmpeqb xmm4,xmm1
pmovmskb esi,xmm4
xor si,0xffff
jz e1

l5:
add eax,16
jmp l4
e1:
lea eax,[ebx+eax]
}

macro find data,elements,block,srchstr
; binary search and insert
{

local l1,l2,e1,e2,e3
pushd data
mov ecx,srchstr
mov ebx,dword[esp]
add ebx,4
mov eax,dword[ebx-4]
lea edx,[ebx+eax*block]
l1:

if 0
pusha
ccall printf,fmt1,dword[ebx-4]
popa
end if

and eax,eax
jz e1
cmp edx,ebx
jle e1
shr eax,1
mov esi,dword[ecx]
cmp dword[ebx+eax*block],esi
jle l1
and eax,eax
jnz l2
inc eax
l2:
lea ebx, [ebx+eax*block]
jmp l1
e1:
mov eax,dword[esp]
add eax,4
lea edx,[eax-4]
mov edx,dword[edx]
lea eax, [eax+edx*block]
mov edx, eax
add edx,block
dec edx
sub eax,ebx
jl e2
push ecx
lea ecx, [edx-block]

if 0
pusha
ccall printf,fmt5,eax,dword[ecx]
popa
end if

strncpy edx,ecx,eax,1
pop ecx

if 0
pusha
ccall printf,fmt5,eax,ecx
popa
end if

mov esi,dword[ecx]
mov dword [ebx],esi ; count
mov esi,dword[ecx+4]
mov dword [ebx+4],esi ;ptrstring
mov eax,dword[esp]
inc dword[eax]
inc dword[elements]
xor eax,eax
jmp e3
e2:
ccall printf,fmt,err2
sys_exit -1
e3:
pop ecx
lea eax,[ebx+eax*block]
}


macro print_strs ptr,size
{
local l1,e1
mov ebx,ptr
mov ecx,dword[ebx]
cmp ecx,0
jz e1
mov edx,dword[sdta]
inc edx
sub edx,size
push edx
fild dword[esp]
mov dword[esp],100
fild dword[esp]
add ebx,4
add esp,4
l1:
push ecx ebx
unpack_str lngbuf,dword[ebx+4],size
mov byte[lngbuf+size],0
pop ebx
push ebx
fild dword[ebx]
fmul st0,st1
fdiv st0,st2
sub esp,8
fstp qword[esp]
ccall printf,fmt, lngbuf
add esp,8
pop ebx ecx
add ebx,8
dec ecx
jnz l1
e1:
}

macro merge data,data1,size,srchlen
{
local l1,l2,l3,t1,e1
mov ecx,size/2
cmp ecx,0
jz e1
mov ebx,data1
l1:
cmp dword[ebx],0
jz l3
push ebx ecx
mov ebx,dword[ebx]
mov ecx,dword[ebx]

if 0
pusha
ccall printf,fmt6,ecx,dword[ebx+4]
popa
end if

add ebx,4
add dword[sum],ecx
inc dword[cnt]
l2:
push ebx ecx
hashfind data,hashtable.elements,8,dword[ebx+4],srchlen
dec dword[hashtable.elements]

pop ecx ebx
mov esi, dword[ebx]
add dword[eax],esi
add ebx,16
dec ecx
jnz l2
pop ecx ebx
l3:
add ebx,4
dec ecx
jnz l1
e1:
}

macro frequencies size
{
local l1,l2,l3,e1
mov edx,size
mov ebx,dword[dta]
mov ecx,dword[sdta]
inc ecx
sub ecx,edx ; loop count
pushd dword[hashtable.data]
pushd dword[hashtable.elements]
push ebx ecx edx

strncpy tstck-5*4,esp,5*4,0
strncpy tstck1-5*4,esp,5*4,0
dec dword[tstck1-4*4]
inc dword[tstck1-3*4]
add dword[tstck1-1*4],0x80000

sys_clone tstck
and eax,eax
jz l1
push eax
sys_clone tstck1
and eax,eax
jz l1
sys_wait eax
pop eax
sys_wait eax
pop edx ecx ebx esi esi
jmp e1
l1:
mov ebp,esp
sub esp,5*4

if 0
ccall printf,fmt1,ecx
ccall printf,fmt1,edx
ccall printf,fmt2,ebx
end if

l2:
pop edx ecx ebx
push ebp
mov ebp,esp
l3:
; push ebx ecx edx
movd xmm5,edx
movd xmm6,ecx
movd xmm7,ebx
hashfind dword [ebp+8], ebp+4,8,ebx,edx
; pop edx ecx ebx
movd edx,xmm5
movd ecx,xmm6
movd ebx,xmm7
inc dword[eax]
add ebx,2 ; interleave
sub ecx,2
jg l3
mov esp,ebp
pop ebp
pop esi
lock add dword[hashtable.elements],esi
pop esi
sys_exit 0
e1:

}

macro frequencies1
{
local l1,l2,l3,e1
mov ecx,dword[hashtable.elements]
cmp ecx,0
jz e1
mov ebx,dword[hashtable.data]
l1:
cmp dword[ebx],0
jz l3
push ebx ecx
mov ebx,dword[ebx]
mov ecx,dword[ebx]

if 0
pusha
ccall printf,fmt6,ecx,dword[ebx+4]
popa
end if

add ebx,4
sub dword[esp],ecx ; was bug, ecx wasnt counted
; when raw had more than 1
l2:
push ebx ecx
find dword[sortedtable.data],sortedtable.elements,8,ebx
pop ecx ebx
add ebx,16
dec ecx
jnz l2
pop ecx ebx
and ecx,ecx ; no decrement here
jz e1
l3:
add ebx,4
jmp l1
e1:
}

macro calc_frequencies size
{
mov edx,size
call cfrequencies
}

macro sort_frequencies size
{
call sfrequencies
}

STDIN equ 0
STDOUT equ 1
STDERR equ 2
fsize equ 8192
SIZE equ 0x40000 ; play with size and mask in strfind
format ELF

section '.text' executable

public main
extrn printf
extrn perror
extrn realloc
extrn free

cfrequencies:
frequencies edx
ret
sfrequencies:
frequencies1
ret

main:
; strnset xtbl,'A',256
mov ebx ,xtbl
mov byte[ebx+'a'],0
mov byte[ebx+'A'],0
mov byte[ebx+'c'],1
mov byte[ebx+'C'],1
mov byte[ebx+'g'],2
mov byte[ebx+'G'],2
mov byte[ebx+'t'],3
mov byte[ebx+'T'],3

mov byte[ebx+0],'A'
mov byte[ebx+1],'C'
mov byte[ebx+2],'G'
mov byte[ebx+3],'T'

l2:
getLine STDIN, buf, 256
movzx eax, byte [buf]
and eax,eax
jz e1
strncmp buf,three,6,0
and ecx,ecx
jnz l2
l1:
getLine STDIN, buf, 256
movzx eax, byte [buf]
and eax,eax
jz e1
cmp eax,'>'
je e1
mov eax,256
sub eax,ecx
dec eax
push eax
add eax, dword [sdta]
ccall realloc,dword[dta], eax
and eax,eax
jz e2
mov dword[dta],eax
pop eax
mov ebx, dword [sdta]
add ebx, dword [dta]
push eax
pack_str ebx,buf,eax,0
pop eax
add dword[sdta],edx
jmp l1
e1:
initvector hashtable.data,hashtable.size,SIZE,4
dwordnset dword[hashtable.data],0,SIZE

; sys_write STDOUT, dword [dta], dword[sdta]

; ccall printf,fmt1,dword[sdta]

macro write_frequencies len
{
calc_frequencies len
mov esi,dword[hashtable.data]
add esi,0x80000
merge dword[hashtable.data],esi,SIZE,len
; ccall printf,fmt1,dword[cnt]
; ccall printf,fmt1,dword[sum]
; ccall printf, fmt1, dword [hashtable.elements]
initvector sortedtable.data,sortedtable.size,10000,24
mov ebx,10000
imul ebx,6
dwordnset dword[sortedtable.data],0,ebx
sort_frequencies len
print_strs dword[sortedtable.data],len
; ccall printf,fmt1,dword[sortedtable.elements]
mov dword[sortedtable.elements],0
mov dword [hashtable.elements],0
freevector dword[hashtable.data],SIZE
ccall printf,nl
}
write_frequencies 1
write_frequencies 2

macro write_count len,str
{
mov dword[sum],0
mov dword[cnt],0
calc_frequencies len
; ccall printf, fmt1, dword [hashtable.elements]
mov esi,dword[hashtable.data]
add esi,0x80000
merge dword[hashtable.data],esi,SIZE,len
; ccall printf,fmt1,dword[cnt]
; ccall printf,fmt1,dword[sum]
pack_str lngbuf,str,len,0
hashfind dword[hashtable.data],hashtable.elements,8,lngbuf,edx
push eax
unpack_str str,dword[eax+4],len
pop eax
ccall printf, fmt5, dword [eax],str
; ccall printf, fmt1, dword [hashtable.elements]
mov dword [hashtable.elements],0
freevector dword[hashtable.data],SIZE
}
write_count 3,lngstr4
write_count 4,lngstr3
write_count 6,lngstr2
write_count 12,lngstr1
write_count 18,lngstr

xor eax,eax
ret
e2:
ccall perror, err1
sys_exit -1

section '.data' writeable

align 4
fmt db "%s %.3f",0xa,0
fmt1 db "%u",0xa,0
fmt2 db "%p",0xa,0
fmt3 db "%c",0xa,0
fmt4 db "%s %u %u",0xa,0
fmt5 db "%u",9,"%s",0xa,0
fmt6 db "%u %u",0xa,0
err1 db "realloc failed",0
err2 db "index error",0
lngstr db "gGtattTtaatttatagt",0
lngstr1 db "ggtATTttaatt",0
lngstr2 db "gGtAtt",0
lngstr3 db "gGta",0
lngstr4 db "GgT",0
three db ">THREE"
nl db 0xa,0

align 4
fptr dd 0
fend dd 0
dta dd 0
sdta dd 0
hashtable vector 0,0
sortedtable vector 0,0
align 4
sema dd 0

section '.bss' writeable

align 4
buf rb 256
align 4
filebuf rb fsize
align 4
xtbl rb 256
align 4
lngbuf rb 128
align 4
bstck rb 4095
tstck rb 1
align 4
bstck1 rb 4095
tstck1 rb 1
sum rd 1
cnt rd 1


--
http://maxa.homedns.org/

Sometimes online sometimes not


From: Branimir Maksimovic on
On Sat, 20 Mar 2010 01:35:52 +0100
Branimir Maksimovic <bmaxa(a)hotmail.com> wrote:

Interesting thing,when I used 4 threads and reduced size of hashetable
by double, I got more speed increase even I have only two cpus
(probably because of cache?);0)

bmaxa(a)maxa:~/fasm/knucleotide$ time taskset -c 0,1 ./knucleotide <~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m9.635s
user 0m16.570s
sys 0m0.130s
bmaxa(a)maxa:~/fasm/knucleotide$ time taskset -c 0,1 ./knucleotidecpp <~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m10.411s
user 0m19.850s
sys 0m0.100s

Greets

--
http://maxa.homedns.org/

Sometimes online sometimes not


From: Branimir Maksimovic on
On Sat, 20 Mar 2010 20:35:17 +0100
Branimir Maksimovic <bmaxa(a)hotmail.com> wrote:

Polished it a bit, sped up loading of file (hint length),
more optimisations regarding caching previous strings (up to 19
chars length),
but slowed it down a bit, in order to support
64 char nucleotides.

Result:

cpp:

bmaxa(a)maxa:~/fasm/knucleotide$ time taskset -c 0,1 ./knucleotidecpp <~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m10.145s
user 0m19.390s
sys 0m0.170s

asm:

bmaxa(a)maxa:~/fasm/knucleotide$ time taskset -c 0,1 ./knucleotide <~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m8.168s
user 0m15.010s
sys 0m0.130s

Finished source ;)

No more postings...on this program ;)

Greets

struc vector d,s
{
.data dd d
.size dd s
.elements dd 0
}

macro ccall proc,[arg] ; call CDECL procedure
{
common
local size
size = 0
reverse
pushd arg
size = size+4
common
call proc
add esp,size
}

macro sys_exit rc
{
mov eax,1 ; exit
mov ebx,rc
int 0x80
}

macro sys_read fd, buf, size
{
mov eax, 3 ; sys_read
mov ebx, fd
mov ecx, buf
mov edx, size
int 0x80
}
macro sys_write fd, buf, size
{
mov eax, 4 ; sys_write
mov ebx, fd
mov ecx, buf
mov edx, size
int 0x80
}

CLONE_VM equ 0x00000100
CLONE_FS equ 0x00000200
CLONE_FILES equ 0x00000400
CLONE_SIGHAND equ 0x00000800
CLONE_THREAD equ 0x00010000

macro sys_clone stack
{
mov eax,120 ; sys_clone
mov ebx,CLONE_VM or CLONE_FS or CLONE_FILES \
or CLONE_SIGHAND;
mov ecx,stack ; choose stack
xor edx,edx ; no struct
int 0x80
}

__WNOTHREAD equ 0x20000000
__WALL equ 0x40000000
__WCLONE equ 0x80000000

macro sys_wait pid
{
mov ebx,pid
mov eax,114 ; sys_wait4
xor ecx,ecx
xor esi,esi
mov edx,__WALL;
int 0x80
}

macro read fd, buf,size
{
local l1,l2,l3
mov edi,buf
mov ebx,size
xor ecx,ecx
mov eax, dword [fptr]
and eax,eax
jnz l2
l1:
push ebx ecx edx edi
strncpy fileresbuf,filebuf,fsize,0
sys_read fd,filebuf,fsize
pop edi edx ecx ebx
and eax,eax
jz l3
lea eax, [eax+filebuf]
mov dword [fend], eax
mov dword [fptr], filebuf
l2:
mov eax, dword [fend]
sub eax, dword [fptr]
jz l1
cmp eax,ebx
cmovg eax,ebx
sub ebx, eax
add ecx,eax
push ecx
strncpy edi,dword [fptr], eax, 0
pop ecx
and ebx,ebx
mov dword [fptr],esi
jnz l1
l3:
mov eax,ecx
}

macro back size
{
sub dword[fptr],size
}

macro getLine fd, buf, size, hint
{
local l1,l2,l3
mov ecx, size
mov ebx,hint
cmp ecx,ebx
cmovl ebx,ecx
xor edx,edx
mov edi, buf
l1:
cmp ecx,0
jle l2
push ebx ecx edx edi
read fd,dword[esp],ebx
pop edi edx ecx ebx
add edx,eax
test eax,eax
jz l2;
sub ecx,eax
push ecx eax
mov ecx,eax
strnchr edi,0xa,ecx
pop eax ecx
cmp byte [edi-1], 0xa
jne l1
dec edi
l2:
mov byte [edi],0
sub edi,buf
mov eax,edx
dec eax
sub eax,edi
jle l3
back eax
l3:
}

macro strnchr s,c,count
{
mov edi,s
mov eax,c
mov ecx,count
cld
repne scasb
}

macro dwordnset s, c, count
{
mov edi,s
mov eax,c
mov ecx,count
cld
rep stosd
}

macro strnset s,c, size
{
mov edi,s
mov eax,c
mov ecx,size
cld
rep stosb
}

macro dwordncmp s1, s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx,size
repe cmpsd
if dir
cld
end if
}

macro strncmp s1, s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx,size
repe cmpsb
if dir
cld
end if
}

macro dwordncpy s1,s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx, size
rep movsd
if dir
cld
end if
}

macro strncpy s1,s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx, size
rep movsb
if dir
cld
end if
}

macro to_num src
{
mov al,src
xlatb
}

macro to_char src
{
mov al,src
xlatb
}

macro pack_str dst,src,size,f
{
if ~f
local l1
mov esi,src
mov edi,dst
mov ecx,size
mov edx,0
mov ebx,xtbl
l1:
to_num byte [esi]
mov byte [edi], al
inc edi
inc esi
inc edx
dec ecx
jnz l1
else
strncpy dst,src,size,0
end if
}

macro really_pack_str dst,src,size,f
{
local l1,l2,e1
mov esi,src
mov edi,dst
mov ecx,size
mov edx,1
l1:
mov ebx,4
mov byte [edi],0
l2:
push ebx
mov ebx,xtbl
to_num byte [esi]
pop ebx
shl byte [edi],2
or byte [edi], al
inc esi
dec ecx
jz e1
dec ebx
jnz l2
inc edi
inc edx ; count
jmp l1
e1:
}

macro unpack_str dst,src,size
{
local l1
mov esi,src
mov edi,dst
mov ecx,size
mov ebx,xtbl
l1:
to_char byte [esi]
mov byte [edi], al
inc edi
inc esi
dec ecx
jnz l1
}

macro initvector data,oldsize,size,block
{
local e1,e2
mov eax, size
imul eax, block
push eax
ccall realloc,dword[data],eax
pop ebx
and eax,eax
jz e1
mov dword[data],eax
mov dword[oldsize],ebx
jmp e2
e1:
ccall perror, err1
sys_exit -1
e2:
}

macro freevector data,size
{
local l1
mov ecx,size
mov ebx,data
l1:
push ebx ecx
ccall free,dword[ebx]
pop ecx ebx
mov dword[ebx],0
add ebx,4
dec ecx
jnz l1
}

macro hash str,size
{
local l1,s1,s2,e1,e2
mov ecx,size
mov ebx,str
mov edi,4
mov esi,16
xor eax,eax

cmp ecx,4
jle s1
pxor xmm2,xmm2
pcmpeqb xmm2,xmm1
pmovmskb edx,xmm2
xor dx,0xffff
je l1
cmp ecx,16
jg s2
sub ecx,4
add ebx,ecx
shl ecx,1
sub ecx,64
neg ecx
movd xmm2,ecx
psllq xmm1,xmm2
psrlq xmm1,xmm2
movd eax,xmm1
mov ecx,4
s1:
pxor xmm1,xmm1
jmp l1
s2:
movd edx,xmm1
psrldq xmm1,4
movd eax,xmm1
cmp ecx,20
jge s1
sub ecx,4
add ebx,ecx
sub ecx,12
mov edi,4
sub edi,ecx
mov esi,edi
shl ecx,1
mov edi,ecx
sub ecx,32
neg ecx
shl edx,cl
shld eax,edx,8
sub edi,8
neg edi
mov ecx,edi
shr eax,cl
mov ecx,4
mov edi,4
jmp s1
l1:
shl eax,2
movzx edx,byte[ebx]
or eax,edx
dec ecx
jle e1
inc ebx
dec esi
jnz l1
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
xor eax,eax
mov esi,16
dec edi
jz e2
jmp l1
e1:
pslldq xmm1,4
movdqa xmm2,xmm1
movd xmm1,eax
por xmm1,xmm2
dec edi
e2:
}

macro hashfind data,elements,block,srchstr,srchlen
{
mov eax,srchstr
movd xmm0,eax
hash srchstr,srchlen
mov ebx,data
strfind elements,block
}

macro strfind elements,block
{
local l1,l2,l3,l4,l5,s1,s2,s3,e1
movdqa xmm2,xmm1
mov edx,4
sub edx,edi
xor eax,eax
s2:
movd ecx,xmm2
shld eax,ecx,16
and ecx,0xffff
add eax,ecx
psrldq xmm2,4
dec edx
jnz s2
s3:
and eax,0xffff
xor esi,esi
shl eax,2
movd xmm2,eax
movd xmm3,ebx
cmp dword[ebx+eax],0
jne l3
l1:
; allocate
s1:
mov ebx,1
xor eax,eax
lock cmpxchg dword[sema],ebx ; test and set
and eax,eax
jnz s1
add esi,28
movd eax,xmm2
movd ebx,xmm3
ccall realloc,dword[ebx+eax],esi ; realloc is not thread safe
lock and dword[sema],0 ; reset
mov esi,eax
and esi,esi
jz e2
movd eax,xmm2
movd ebx,xmm3
cmp dword[ebx+eax],0
mov dword[ebx+eax],esi
jne l2
mov esi, dword[ebx+eax]
mov dword[esi],0
l2:
mov ebx,dword[ebx+eax]
add ebx,4
mov eax,dword[ebx-4]
imul eax,24
mov dword[ebx+eax],0
movd [ebx+eax+4],xmm0
movdqu [ebx+eax+8],xmm1
inc dword[elements]
inc dword[ebx-4]
jmp e1
;search
l3:
mov ebx,dword[ebx+eax]
add ebx,4
xor eax,eax

l4:
mov esi,dword[ebx-4]
imul esi,24
cmp eax,esi
jge l1 ; we need to reallocate
movdqu xmm4,[ebx+eax+8]
pcmpeqb xmm4,xmm1
pmovmskb esi,xmm4
xor si,0xffff
jz e1

l5:
add eax,24
jmp l4
e1:
lea eax,[ebx+eax]
}

macro find data,elements,block,srchstr
; binary search and insert
{

local l1,l2,e1,e2,e3
pushd data
mov ecx,srchstr
mov ebx,dword[esp]
add ebx,4
mov eax,dword[ebx-4]
lea edx,[ebx+eax*block]
l1:

if 0
pusha
ccall printf,fmt1,dword[ebx-4]
popa
end if

and eax,eax
jz e1
cmp edx,ebx
jle e1
shr eax,1
mov esi,dword[ecx]
cmp dword[ebx+eax*block],esi
jle l1
and eax,eax
jnz l2
inc eax
l2:
lea ebx, [ebx+eax*block]
jmp l1
e1:
mov eax,dword[esp]
add eax,4
lea edx,[eax-4]
mov edx,dword[edx]
lea eax, [eax+edx*block]
mov edx, eax
add edx,block
dec edx
sub eax,ebx
jl e2
push ecx
lea ecx, [edx-block]

if 0
pusha
ccall printf,fmt5,eax,dword[ecx]
popa
end if

strncpy edx,ecx,eax,1
pop ecx

if 0
pusha
ccall printf,fmt5,eax,ecx
popa
end if

mov esi,dword[ecx]
mov dword [ebx],esi ; count
mov esi,dword[ecx+4]
mov dword [ebx+4],esi ;ptrstring
mov eax,dword[esp]
inc dword[eax]
inc dword[elements]
xor eax,eax
jmp e3
e2:
ccall printf,fmt,err2
sys_exit -1
e3:
pop ecx
lea eax,[ebx+eax*block]
}


macro print_strs ptr,size
{
local l1,e1
emms
mov esi,size
mov ebx,ptr
mov ecx,dword[ebx]
cmp ecx,0
jz e1
mov edx,dword[sdta]
inc edx
sub edx,esi
push edx
fild dword[esp]
mov dword[esp],100
fild dword[esp]
add ebx,4
add esp,4
l1:
push ecx ebx esi
unpack_str lngbuf,dword[ebx+4],size
pop esi
mov byte[lngbuf+esi],0
pop ebx
push ebx
fild dword[ebx]
fmul st0,st1
fdiv st0,st2
sub esp,8
fstp qword[esp]
ccall printf,fmt, lngbuf
add esp,8
pop ebx ecx
add ebx,8
dec ecx
jnz l1
e1:
}

macro merge data,data1,size,srchlen
{
local l1,l2,l3,t1,e1
mov ecx,size/4
cmp ecx,0
jz e1
mov ebx,data1
l1:
cmp dword[ebx],0
jz l3
push ebx ecx
mov ebx,dword[ebx]
mov ecx,dword[ebx]

if 0
pusha
ccall printf,fmt6,ecx,dword[ebx+4]
popa
end if

add ebx,4
add dword[sum],ecx
inc dword[cnt]
l2:
push ebx ecx
pxor xmm1,xmm1
hashfind data,hashtable.elements,8,dword[ebx+4],srchlen
dec dword[hashtable.elements]

pop ecx ebx
mov esi, dword[ebx]
add dword[eax],esi
add ebx,24
dec ecx
jnz l2
pop ecx ebx
l3:
add ebx,4
dec ecx
jnz l1
e1:
}

macro frequencies size
{
local l1,l2,l3,e1
mov edx,size
mov ebx,dword[dta]
mov ecx,dword[sdta]
inc ecx
sub ecx,edx ; loop count
pushd dword[hashtable.data]
pushd dword[hashtable.elements]
push ebx ecx edx
strncpy tstck-5*4,esp,5*4,0
strncpy tstck1-5*4,esp,5*4,0
strncpy tstck2-5*4,esp,5*4,0
strncpy tstck3-5*4,esp,5*4,0

sub dword[tstck1-4*4],1
add dword[tstck1-3*4],1
add dword[tstck1-1*4],0x40000
sub dword[tstck2-4*4],2
add dword[tstck2-3*4],2
add dword[tstck2-1*4],0x80000
sub dword[tstck3-4*4],3
add dword[tstck3-3*4],3
add dword[tstck3-1*4],0xc0000

sys_clone tstck ;1
and eax,eax
jz l1
push eax
sys_clone tstck1;2
and eax,eax
jz l1
push eax
sys_clone tstck2;3
and eax,eax
jz l1
push eax
sys_clone tstck3;4
and eax,eax
jz l1
sys_wait eax ;1
pop eax
sys_wait eax ;2
pop eax
sys_wait eax ;3
pop eax
sys_wait eax ;4
pop edx ecx ebx esi esi
jmp e1
l1:
mov ebp,esp
sub esp,5*4

if 0
ccall printf,fmt1,ecx
ccall printf,fmt1,edx
ccall printf,fmt2,ebx
end if

l2:
pop edx ecx ebx
push ebp
mov ebp,esp
pxor xmm1,xmm1
l3:
; push ebx ecx edx
movd xmm5,edx
movd xmm6,ecx
movd xmm7,ebx
hashfind dword [ebp+8], ebp+4,8,ebx,edx
; pop edx ecx ebx
movd edx,xmm5
movd ecx,xmm6
movd ebx,xmm7
inc dword[eax]
add ebx,4 ; interleave
sub ecx,4
jg l3
mov esp,ebp
pop ebp
pop esi
lock add dword[hashtable.elements],esi
pop esi
sys_exit 0
e1:

}

macro frequencies1
{
local l1,l2,l3,e1
mov ecx,dword[hashtable.elements]
cmp ecx,0
jz e1
mov ebx,dword[hashtable.data]
l1:
cmp dword[ebx],0
jz l3
push ebx ecx
mov ebx,dword[ebx]
mov ecx,dword[ebx]

if 0
pusha
ccall printf,fmt6,ecx,dword[ebx+4]
popa
end if

add ebx,4
sub dword[esp],ecx ; was bug, ecx wasnt counted
; when raw had more than 1
l2:
push ebx ecx
find dword[sortedtable.data],sortedtable.elements,8,ebx
pop ecx ebx
add ebx,24
dec ecx
jnz l2
pop ecx ebx
and ecx,ecx ; no decrement here
jz e1
l3:
add ebx,4
jmp l1
e1:
}

macro mwrite_frequencies ; len
{
push ebp
mov ebp,esp
calc_frequencies dword[ebp+8]
mov esi,dword[hashtable.data]
add esi,0x40000
merge dword[hashtable.data],esi,SIZE,dword[ebp+8]
mov esi,dword[hashtable.data]
add esi,0x80000
merge dword[hashtable.data],esi,SIZE,dword[ebp+8]
mov esi,dword[hashtable.data]
add esi,0xc0000
merge dword[hashtable.data],esi,SIZE,dword[ebp+8]
; ccall printf,fmt1,dword[cnt]
; ccall printf,fmt1,dword[sum]
; ccall printf, fmt1, dword [hashtable.elements]
initvector sortedtable.data,sortedtable.size,dword[hashtable.elements],24
mov ebx,dword[hashtable.elements]
imul ebx,6
dwordnset dword[sortedtable.data],0,ebx
sort_frequencies dword[ebp+8]
print_strs dword[sortedtable.data],dword[ebp+8]
; ccall printf,fmt1,dword[sortedtable.elements]
mov dword[sortedtable.elements],0
mov dword [hashtable.elements],0
freevector dword[hashtable.data],SIZE
ccall printf,nl
mov esp,ebp
pop ebp
}

macro mwrite_count ; len,str
{
push ebp
mov ebp,esp
mov dword[sum],0
mov dword[cnt],0
calc_frequencies dword[ebp+12]
; ccall printf, fmt1, dword [hashtable.elements]
mov esi,dword[hashtable.data]
add esi,0x40000
merge dword[hashtable.data],esi,SIZE,dword[ebp+12]
mov esi,dword[hashtable.data]
add esi,0x80000
merge dword[hashtable.data],esi,SIZE,dword[ebp+12]
mov esi,dword[hashtable.data]
add esi,0xc0000
merge dword[hashtable.data],esi,SIZE,dword[ebp+12]
; ccall printf,fmt1,dword[cnt]
; ccall printf,fmt1,dword[sum]
pack_str lngbuf,dword[ebp+8],dword[ebp+12],0
pxor xmm1,xmm1
hashfind dword[hashtable.data],hashtable.elements,8,lngbuf,edx
push eax
unpack_str dword[ebp+8],dword[eax+4],dword[ebp+12]
pop eax
ccall printf, fmt5, dword [eax],dword[ebp+8]
; ccall printf, fmt1, dword [hashtable.elements]
mov dword [hashtable.elements],0
freevector dword[hashtable.data],SIZE
mov esp,ebp
pop ebp
}

macro calc_frequencies size
{
mov edx,size
call cfrequencies
}

macro sort_frequencies size
{
call sfrequencies
}

macro write_frequencies len
{
pushd len
call swrite_fruequencies
add esp,4
}

macro write_count len,str
{
pushd len str
call swrite_count
add esp,8
}

STDIN equ 0
STDOUT equ 1
STDERR equ 2
fsize equ 4096
SIZE equ 0x40000 ; play with size and mask in strfind
format ELF

section '.text' executable

public main
extrn printf
extrn perror
extrn realloc
extrn free

cfrequencies:
frequencies edx
ret
sfrequencies:
frequencies1
ret
swrite_fruequencies:
mwrite_frequencies
ret
swrite_count:
mwrite_count
ret
main:
strnset xtbl,'A',0
mov ebx ,xtbl
mov byte[ebx+'a'],0
mov byte[ebx+'A'],0
mov byte[ebx+'c'],1
mov byte[ebx+'C'],1
mov byte[ebx+'g'],2
mov byte[ebx+'G'],2
mov byte[ebx+'t'],3
mov byte[ebx+'T'],3

mov byte[ebx+0],'A'
mov byte[ebx+1],'C'
mov byte[ebx+2],'G'
mov byte[ebx+3],'T'

l2:
getLine STDIN, buf, 256, 61
movzx eax, byte [buf]
and eax,eax
jz e1
strncmp buf,three,6,0
and ecx,ecx
jnz l2
l1:
getLine STDIN, buf, 256, 61
movzx eax, byte [buf]
and eax,eax
jz e1
cmp eax,'>'
je e1
mov eax,edi
push eax
add eax, dword [sdta]
ccall realloc,dword[dta], eax
and eax,eax
jz e2
mov dword[dta],eax
pop eax
mov ebx, dword [sdta]
add ebx, dword [dta]
push eax
pack_str ebx,buf,eax,0
pop eax
add dword[sdta],edx
jmp l1
e1:
initvector hashtable.data,hashtable.size,SIZE,4
dwordnset dword[hashtable.data],0,SIZE

; sys_write STDOUT, dword [dta], dword[sdta]

; ccall printf,fmt1,dword[sdta]
cmp dword[sdta],0
je e3

write_frequencies 1
write_frequencies 2

write_count 3,lngstr4
write_count 4,lngstr3
write_count 6,lngstr2
write_count 12,lngstr1
; write_count 17,lngstr
write_count 18,lngstr
; write_count 19,lngstr
; write_count 64,lngstr
e3:
xor eax,eax
ret
e2:
ccall perror, err1
sys_exit -1

section '.data' writeable

align 4
fmt db "%s %.3f",0xa,0
fmt1 db "%u",0xa,0
fmt2 db "%p",0xa,0
fmt3 db "%c",0xa,0
fmt4 db "%s %u",0xa,0
fmt5 db "%u",9,"%s",0xa,0
fmt6 db "%u %u",0xa,0
fmt7 db "%x",0xa,0
err1 db "realloc failed",0
err2 db "index error",0
lngstr db "gGtattTtaatttatagt",0
lngstr1 db "ggtATTttaatt",0
lngstr2 db "gGtAtt",0
lngstr3 db "gGta",0
lngstr4 db "GgT",0
three db ">THREE"
nl db 0xa,0

align 4
fptr dd 0
fend dd 0
dta dd 0
sdta dd 0
hashtable vector 0,0
sortedtable vector 0,0
align 4
sema dd 0

section '.bss' writeable

align 4
buf rb 256
align 4
fileresbuf rb fsize
filebuf rb fsize
align 4
xtbl rb 256
align 4
lngbuf rb 128
align 4
bstck rb 4095
tstck rb 1
align 4
bstck1 rb 4095
tstck1 rb 1
align 4
bstck2 rb 4095
tstck2 rb 1
align 4
bstck3 rb 4095
tstck3 rb 1
sum rd 1
cnt rd 1


--
http://maxa.homedns.org/

Sometimes online sometimes not