I'm trying to learn some basic x86 assembly, so I've begun solving Project Euler problems. I was hoping for some critique of my code, ideally covering both the efficiency of the operations and the readability / style of the code itself. I have included the Makefile I use on 64-bit Linux.

The purpose of the code is to sum all integers in the range [0, 1000) that are divisible by 3 or 5.

The code can be run using `make RUN=euler_1`.

NB:

I am aware that most compilers replace a modulo by a known constant with a multiplication by a precomputed reciprocal followed by shifts (e.g. `imul` and `shr`) to avoid the integer division. For example, see [this thread][1].

**Makefile**

 # Build and run a single NASM program: `make RUN=<name>` assembles <name>.asm,
 # links it against libc and executes the resulting <name>.elf.
 
 # Neither target produces a file of its own name, so both must be phony;
 # otherwise a stray file called `all` or `clean` would silently disable them.
 .PHONY:	all clean
 
 # Default goal: build $(RUN).elf, then run it.
 all:	$(RUN).elf
 	./$<
 
 # Link with `main` as the ELF entry point (no C runtime startup files) and
 # name the dynamic loader explicitly so that libc's printf can be resolved.
 %.elf:	%.o
 	ld $^ -o $@ -lc -e main -dynamic-linker /lib64/ld-linux-x86-64.so.2
 
 # Assemble to a 64-bit ELF object; the output name is spelled out instead of
 # relying on nasm's default.
 %.o:	%.asm
 	nasm -f elf64 $< -o $@
 
 clean:
 	rm -f *.o *.elf


**euler_1.asm**

 ;-----------------------------------------------------------------------
 ; euler_1.asm - Project Euler problem 1
 ; Syntax: NASM (Intel operand order)   Target: x86-64 Linux
 ; ABI:    System V AMD64, dynamically linked against libc
 ;-----------------------------------------------------------------------
 
 extern printf
 extern exit
 global main
 
 LIMIT       equ 1000                    ; exclusive upper bound of the range
 DIVISOR_A   equ 3
 DIVISOR_B   equ 5
 
 ; The format string is never written to, so it lives in read-only data.
 section .rodata
 fmt: db "%d", 0x0a, 0
 
 section .text
 
 ;-----------------------------------------------------------------------
 ; main - print the sum of all i in [0, LIMIT) divisible by DIVISOR_A or
 ;        DIVISOR_B, then terminate the process with status 0.
 ; In:    nothing is read from registers or the stack
 ; Out:   does not return (ends in exit(0))
 ; Regs:  r8d = running sum, ecx = loop counter i,
 ;        eax/edx = dividend/remainder for div, r9d = current divisor
 ; Note:  all values fit in 32 bits (the sum is printed with %d anyway),
 ;        so 32-bit operations are used throughout; they zero-extend into
 ;        the full 64-bit registers.
 ;-----------------------------------------------------------------------
 main:
         ; Force 16-byte stack alignment for the calls below. This makes the
         ; code correct both as the raw ELF entry point and when invoked by
         ; the C runtime; it is safe because main never returns.
         and     rsp, -16
 
         xor     r8d, r8d                ; sum = 0
         xor     ecx, ecx                ; i = 0
 
 .next:                                  ; for i in [0, LIMIT) {
         ; if i % DIVISOR_A == 0 -> add
         mov     eax, ecx
         xor     edx, edx                ; div divides edx:eax, clear high half
         mov     r9d, DIVISOR_A
         div     r9d                     ; edx = i % DIVISOR_A
         test    edx, edx
         jz      .add
 
         ; else if i % DIVISOR_B == 0 -> add
         mov     eax, ecx
         xor     edx, edx
         mov     r9d, DIVISOR_B
         div     r9d                     ; edx = i % DIVISOR_B
         test    edx, edx
         jnz     .skip
 
 .add:
         add     r8d, ecx                ; sum += i
 
 .skip:
         inc     ecx
         cmp     ecx, LIMIT
         jb      .next                   ; unsigned: the counter is never negative
                                         ; }
 
         ; printf("%d\n", sum)
         lea     rdi, [rel fmt]
         mov     esi, r8d
         xor     eax, eax                ; variadic call: no vector registers used
         call    printf
 
         ; exit(0) through libc rather than the raw exit syscall, so stdio
         ; buffers are flushed and the output is not lost when stdout is a
         ; pipe or a file.
         xor     edi, edi
         call    exit



 [1]: https://stackoverflow.com/questions/8021772/assembly-language-how-to-do-modulo