From: Branimir Maksimovic on
As an exercise I took the k-nucleotide benchmark.

http://shootout.alioth.debian.org/u32/performance.php?test=knucleotide

My naive assembler program (fasm):

struc vector d,s
{
.data dd d
.size dd s
.elements dd 0
}
macro ccall proc,[arg] ; call CDECL procedure
{
common
local size
size = 0
reverse
pushd arg
size = size+4
common
call proc
add esp,size
}

macro sys_exit rc
{
mov eax,1 ; exit
mov ebx,rc
int 0x80
}

macro sys_read fd, buf, size
{
mov eax, 3 ; sys_read
mov ebx, fd
mov ecx, buf
mov edx, size
int 0x80
}
macro sys_write fd, buf, size
{
mov eax, 4 ; sys_write
mov ebx, fd
mov ecx, buf
mov edx, size
int 0x80
}
macro read fd, buf,size
{
local l1,l2,l3
mov eax, dword [fptr]
and eax,eax
jnz l2
l1:
sys_read fd,filebuf,fsize
and eax,eax
jz l3
lea eax, [eax+filebuf]
mov dword [fend], eax
mov dword [fptr], filebuf
l2:
mov ecx, size
mov ebx, size
mov eax, dword [fend]
sub eax, dword [fptr]
jz l1
cmp eax,ecx
cmovl ecx,eax
mov eax,ecx
sub ebx, ecx
strncpy buf,dword [fptr], ecx, 0
and ebx,ebx
mov dword [fptr],esi
jnz l1
l3:
}

macro getLine fd, buf, size
{
local l1,l2
mov ecx, size
mov edi, buf
l1:
and ecx,ecx
jz l2
push ecx
push edi
read fd,dword[esp],1
pop edi
pop ecx
cmp eax,1
jne l2;
dec ecx
inc edi
cmp byte [edi-1], 0xa
jnz l1
dec edi
l2:
mov byte [edi],0
}

macro dwordnset s, c, count
{
mov edi,s
mov eax,c
mov ecx,count
cld
rep stosd
}

macro strnset s,c, size
{
mov edi,s
mov eax,c
mov ecx,size
cld
rep stosb
}

macro dwordncmp s1, s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx,size
repe cmpsd
if dir
cld
end if
}

macro strncmp s1, s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx,size
repe cmpsb
if dir
cld
end if
}

macro dwordncpy s1,s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx, size
rep movsd
if dir
cld
end if
}

macro strncpy s1,s2, size, dir
{
if ~ dir
cld
else
std
end if
mov esi,s2
mov edi,s1
mov ecx, size
rep movsb
if dir
cld
end if
}

macro to_num src
{
local l1,l2,l3,l4,e1
cmp src,'A'
je l1
cmp src,'a'
je l1
cmp src,'C'
je l2
cmp src,'c'
je l2
cmp src,'G'
je l3
cmp src,'g'
je l3
; cmp src,'T'
; je l4
; cmp src,'t'
jmp l4
l1:
mov al,0
jmp e1
l2:
mov al,1
jmp e1
l3:
mov al,2
jmp e1
l4:
mov al,3
e1:
}

macro to_char src
{
local l1,l2,l3,l4,e1
cmp src,0
je l1
cmp src,1
je l2
cmp src,2
je l3
; cmp src,3
jmp l4
l1:
mov al,'A'
jmp e1
l2:
mov al,'C'
jmp e1
l3:
mov al,'G'
jmp e1
l4:
mov al,'T'
e1:
}

macro pack_str dst,src,size
{
local l1
mov esi,src
mov edi,dst
mov ecx,size
l1:
to_num byte [esi]
mov byte [edi], al
inc edi
inc esi
dec ecx
jnz l1
}

macro unpack_str dst,src,size
{
local l1
mov esi,src
mov edi,dst
mov ecx,size
l1:
to_char byte [esi]
mov byte [edi], al
inc edi
inc esi
dec ecx
jnz l1
}

macro initvector data,oldsize,size,block
{
local e1,e2
mov eax, size
imul eax, block
push eax
ccall realloc,dword[data],eax
pop ebx
and eax,eax
jz e1
mov dword[data],eax
mov dword[oldsize],ebx
jmp e2
e1:
ccall perror, err1
sys_exit -1
e2:
}

macro hash str,size
{
local l1
mov ecx, size
mov eax,0
mov ebx,str
l1:
imul eax,eax,31
movzx edx,byte [ebx]
add eax, edx
inc ebx
dec ecx
jnz l1
}

macro hashfind data,elements,block,srchstr,srchlen
{
pushd srchstr
pushd srchlen
hash srchstr,srchlen
mov ebx,data
and eax,0x3ffff
shl eax,5
strfind elements,block
; lea eax,[ebx+eax*block]
}

macro strfind elements,block
{
local l1,l2,e1
pop ecx ; len
pop edx ; s
l1:
cmp dword[ebx+eax],0
jne l2
mov dword[ebx+eax],edx
mov dword[ebx+eax+4],0
inc dword[elements]
jmp e1
l2:
push ecx
strncmp dword[ebx+eax],edx,ecx,0
pop ecx
je e1
add eax,block
jmp l1
e1:
lea eax,[ebx+eax]
}

macro find data,elements,block,srchstr,srchlen
; binary search and insert
{
local l1,l2,e1,e2,e3
mov ecx,srchstr
mov eax,dword[elements]
mov ebx,data
lea edx,[ebx+eax*block]
l1:
and eax,eax
jz e1
cmp edx,ebx
jle e1
shr eax,1
push ecx
strncmp dword[ebx+eax*block],ecx,srchlen,0
pop ecx
jl l1
je e3
and eax,eax
jnz l2
inc eax
l2:
lea ebx, [ebx+eax*block]
jmp l1
e1:
mov eax,data
mov edx,dword[elements]
lea eax, [eax+edx*block]
mov edx, eax
add edx,block
dec edx
sub eax,ebx
jl e2
push ecx
lea ecx, [edx-block]

if 0
pusha
ccall printf,fmt5,eax,dword[ecx]
popa
end if

strncpy edx,ecx,eax,1
pop ecx

if 0
pusha
ccall printf,fmt5,eax,ecx
popa
end if

mov dword [ebx],ecx
mov dword [ebx+4],0 ; heh
inc dword[elements]
xor eax,eax
jmp e3
e2:
ccall printf,fmt,err2
sys_exit -1
e3:
lea eax,[ebx+eax*block]
}

macro calc_frequencies size
{
local l1
mov ecx,dword [sdta]
inc ecx
sub ecx,size
xor eax,eax
l1:
push ecx ; end
push eax ; counter
mov ebx,dword [dta]
add ebx,eax
hashfind dword [hashtable.data], hashtable.elements,8,ebx,size
; find dword [hashtable.data], hashtable.elements,8,ebx,size
inc dword[eax+4]
pop eax
pop ecx
inc eax
cmp eax,ecx
jne l1
}

macro print_strs ptr,cnt
{
local l1,l2,e1
mov ebx,ptr
mov ecx,cnt
cmp dword[ebx],0
je l2
l1:
push ebx
push ecx
ccall printf,fmt, dword [ebx]
pop ecx
pop ebx
l2:
dec ecx
jz e1
add ebx,4
cmp dword[ebx],0
je l2
jmp l1
e1:
}

STDIN equ 0
STDOUT equ 1
STDERR equ 2
fsize equ 16384
format ELF
SIZE equ 2097152

section '.text' executable

public main
extrn printf
extrn perror
extrn realloc
extrn free

main:
getLine STDIN, buf, 256
movzx eax, byte [buf]
and eax,eax
jz e1
strncmp buf,three,6,0
and ecx,ecx
jnz main
l1:
getLine STDIN, buf, 256
movzx eax, byte [buf]
and eax,eax
jz e1
cmp eax,'>'
je e1
mov eax,256
sub eax,ecx
dec eax
push eax
add eax, dword [sdta]
ccall realloc,dword[dta], eax
and eax,eax
jz e2
mov dword[dta],eax
pop eax
mov ebx, dword [sdta]
add ebx, dword [dta]
push eax
pack_str ebx,buf,eax
pop eax
add dword[sdta],eax
jmp l1
e1:
initvector hashtable.data,hashtable.size,SIZE,4
dwordnset dword[hashtable.data],0,SIZE
; dwordnset dword[hashtable.data],rstm,100
; find dword [hashtable.data],hashtable.elements,8,msg,1
; find dword [hashtable.data],hashtable.elements,8,msg7,1
; find dword [hashtable.data],hashtable.elements,8,msg1,1
; find dword [hashtable.data],hashtable.elements,8,msg2,1
; find dword [hashtable.data],hashtable.elements,8,msg3,1
; find dword [hashtable.data],hashtable.elements,8,msg4,1
; find dword [hashtable.data],hashtable.elements,8,msg5,1
; find dword [hashtable.data],hashtable.elements,8,msg6,1
; find dword [hashtable.data],hashtable.elements,8,msg,1
; find dword [hashtable.data],hashtable.elements,8,msg4,1
calc_frequencies 1
mov dword [hashtable.elements],0
dwordnset dword[hashtable.data],0,SIZE
calc_frequencies 2
mov dword [hashtable.elements],0
dwordnset dword[hashtable.data],0,SIZE

calc_frequencies 3
pack_str lngbuf,lngstr4,3
hashfind dword [hashtable.data],hashtable.elements,8,lngbuf,3
ccall printf, fmt1, dword [eax+4]
ccall printf, fmt1, dword [hashtable.elements]
mov dword [hashtable.elements],0
dwordnset dword[hashtable.data],0,SIZE

calc_frequencies 4
pack_str lngbuf,lngstr3,4
hashfind dword [hashtable.data],hashtable.elements,8,lngbuf,4
ccall printf, fmt1, dword [eax+4]
ccall printf, fmt1, dword [hashtable.elements]
mov dword [hashtable.elements],0
dwordnset dword[hashtable.data],0,SIZE

calc_frequencies 6
pack_str lngbuf,lngstr2,6
hashfind dword [hashtable.data],hashtable.elements,8,lngbuf,6
ccall printf, fmt1, dword [eax+4]
ccall printf, fmt1, dword [hashtable.elements]
mov dword [hashtable.elements],0
dwordnset dword[hashtable.data],0,SIZE

calc_frequencies 12
pack_str lngbuf,lngstr1,12
hashfind dword [hashtable.data],hashtable.elements,8,lngbuf,12
ccall printf, fmt1, dword [eax+4]
ccall printf, fmt1, dword [hashtable.elements]
mov dword [hashtable.elements],0
dwordnset dword[hashtable.data],0,SIZE

calc_frequencies 18
mov eax,dword[hashtable.data]
pack_str lngbuf,lngstr,18
hashfind dword [hashtable.data],hashtable.elements,8,lngbuf,18
ccall printf, fmt1, dword [eax+4]
; print_strs eax,20
ccall printf, fmt1, dword [hashtable.elements]
ccall printf, fmt1, dword [sdta]
; sys_write STDOUT, dword [dta], dword[sdta]
xor eax,eax
ret
e2:
ccall perror, err1
sys_exit -1

section '.data' writeable

align 4
fmt db "%10s",0xa,0
fmt1 db "%u",0xa,0
fmt2 db "%p",0xa,0
fmt3 db "%c",0xa,0
fmt4 db "%s %u %u",0xa,0
fmt5 db "%u %s",0xa,0
err1 db "realloc failed",0
err2 db "index error",0
msg db "a",0
msg1 db "b",0
msg2 db "c",0
msg3 db "d",0
msg4 db "e",0
msg5 db "f",0
msg6 db "g",0
msg7 db "h",0
lngstr db "ggtattttaatttatagt",0
lngstr1 db "GGTATTTTAATT",0
lngstr2 db "GGTATT",0
lngstr3 db "GGTA",0
lngstr4 db "GGT",0
three db ">THREE"

align 4
fptr dd 0
fend dd 0
dta dd 0
sdta dd 0
hashtable vector 0,0

section '.bss' writeable
align 4
buf rb 256
align 4
filebuf rb fsize
align 4
lngbuf rb 18

I haven't completed it yet (I still need to write the sort routine, but that
costs just a few ticks),
but here is the speed:

bmaxa(a)maxa:~/fasm/knucleotide$ cat start.sh
fasm -m 32768 knucleotide.asm
gcc -m32 knucleotide.o -o knucleotide
strip knucleotide

bmaxa(a)maxa:~/fasm/knucleotide$
bmaxa(a)maxa:~/fasm/knucleotide$ ./start.sh
flat assembler version 1.68 (32768 kilobytes memory)
4 passes, 4514 bytes.
bmaxa(a)maxa:~/fasm/knucleotide$ time ./knucleotide < ~/long-input.txt
1471758
64
446535
256
47336
4096
893
138127
893
139882
125000000

real 0m32.982s
user 0m32.820s
sys 0m0.070s
second place c++ program time:
bmaxa(a)maxa:~/fasm/knucleotide$ time ./knucleotidecpp < ~/long-input.txt
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758 GGT
446535 GGTA
47336 GGTATT
893 GGTATTTTAATT
893 GGTATTTTAATTTATAGT

real 0m12.301s
user 0m22.840s
sys 0m0.200s

22 seconds of user time, which is faster (12 seconds wall-clock, multithreaded), but I'm faster
than C and Java with the most naive approach ;)

but the C++ program consumes 360 MB virtual / 144 MB resident on my machine

All in all, writing code in assembler is pretty fast,
even though I'm not well versed in it (yet), but I do have experience
from the '90s ;)

Greets
From: Branimir Maksimovic on
Branimir Maksimovic wrote:
>
> but consumes 360 mb virt/144 res on my machine
Yes, the C++ program is faster, but it consumes more memory
than the asm one. I wasn't clear ;)

>
> All in all, writing code assembler is pretty fast,
> as Im not versed in it (yet), but have experience
> in 90ies ;)
>
> Greets
From: James Harris on
On 25 Feb, 12:12, Branimir Maksimovic <bm...(a)hotmail.com> wrote:
> Branimir Maksimovic wrote:
>
> > but consumes 360 mb virt/144 res on my machine
>
> yes c++ program is faster but consumes more memory
> than asm. I wasn't clear ;)
>
>
>
> > All in all, writing code assembler is pretty fast,
> > as Im not versed in it (yet), but have experience
> > in 90ies ;)

You may get some replies if you post a *much* smaller query. Your
initial post was enormous!
From: Branimir Maksimovic on
James Harris wrote:
> On 25 Feb, 12:12, Branimir Maksimovic <bm...(a)hotmail.com> wrote:
>> Branimir Maksimovic wrote:
>>
>>> but consumes 360 mb virt/144 res on my machine
>> yes c++ program is faster but consumes more memory
>> than asm. I wasn't clear ;)
>>
>>
>>
>>> All in all, writing code assembler is pretty fast,
>>> as Im not versed in it (yet), but have experience
>>> in 90ies ;)
>
> You may get some replies if you post a *much* smaller query. Your
> initial post was enormous!

Heh, yes, but I wanted to post the complete source so anyone can compile it.
Perhaps someone will say whether it's crappy or OK ;)
I didn't apply any special hardware-specific optimisation techniques and
still got decent speed ;)

Greets
From: Bobbias on
On Feb 25, 8:53 pm, Branimir Maksimovic <bm...(a)hotmail.com> wrote:
> James Harris wrote:
> > On 25 Feb, 12:12, Branimir Maksimovic <bm...(a)hotmail.com> wrote:
> >> Branimir Maksimovic wrote:
>
> >>> but consumes 360 mb virt/144 res on my machine
> >> yes c++ program is faster but consumes more memory
> >> than asm. I wasn't clear ;)
>
> >>> All in all, writing code assembler is pretty fast,
> >>> as Im not versed in it (yet), but have experience
> >>> in 90ies ;)
>
> > You may get some replies if you post a *much* smaller query. Your
> > initial post was enormous!
>
> He, yes, but I wanted to post complete source so anyone can compile.
> Perhaps someone will say it's crappy or ok ;)
> I didn't apply any special optimisation technique regarding hardware and
> got decent speed ;)
>
> Greets- Hide quoted text -
>
> - Show quoted text -

It would have been a better idea to upload the source to something
like http://pastebin.com/ and link to the page there, giving those who
are interested, the ability to read the source, and those who aren't
interested, the chance to avoid scrolling for 5 minutes to get to
another post, lol.