|  |     Here's my entry. It's really the same algorithm as in .1, but the loops
    are unrolled and it uses a bitvector to remember which cards were
    present. It is also reusable (.1 is not, because it does not
    reinitialise the TEMP byte vector)
    
    Andrew
    
	.title	sort13
	.psect	sort13,rd,exe,nowrt,page
; REQUIRE FILES
	$ssdef
; MACRO DEFINITIONS
; macro to generate an instruction for scanning the input array
.macro	scan_in	?l1
bbss	(r0)+,(sp),l1
l1:
.endm
; macro to generate instructions for scanning the output array
.macro	scan_out x,?l2
.iif lt, x-32,	bbc	#x,r2,l2
.iif ge, x-32,	bbc	#x-32,r3,l2
	movl	#x,(r1)+
l2:
.endm
; DATA DEFINITIONS
; data inserted to make the input scan instructions be aligned
	.blkb	1
;++
;
; FUNCTIONAL DESCRIPTION:
;
;	Sorts an array of 13 integers, each integer is in the
;	range 1 to 52, with no duplicates
;	This routine is reusable and reentrant
;
; FORMAL PARAMETERS:
;
;	sortarray - an integer array, read and written
;		    the address of the first longword in the array
;		    is passed
;
; IMPLICIT INPUTS:
;
;	none
;
; IMPLICIT OUTPUTS:
;
;	none
;
; ROUTINE VALUE:
;
;	none
;
; COMPLETION CODES:
;
;	ss$_normal
;
; SIDE EFFECTS:
;
;	none
;
;--
	.entry	sort13, ^m<r2,r3>
	movl	4(ap), r0		; address of array
	movl	r0, r1			; copy for second loop
; initialise bitvector (pity we cant do this in registers)
	clrq	-(sp)
; scan the input array
; note that these instructions are aligned on longword boundaries
; in case it makes it go faster !
; this is not done for the output loop, as each set of instructions
; is 7 bytes long, putting an extra NOOP in would probably slow it
; down more in the cases the branch is not taken than would be gained
; if the branch is taken
.repeat	13
	scan_in
.endr
; get bitvector into registers
	movq	(sp)+,r2
; scan the bit vector, creating the output array
bitno=1
.repeat	52
	scan_out bitno
bitno=bitno+1
.endr
; set up status code
	movl	#ss$_normal,r0
	ret
	.end
    
 | 
|  |     re .-1
    Actually you could reduce the number of taken branches when scanning
    the bitvector for a further speedup by threading together some code
    like this, where r2,r3 are hold the bitvector.  You basically fall thru
    till you hit another card, when you branch to store it and fall thru
    some more...
100$:
        bbs	#0,r2,101$
        bbs	#1,r2,102$
	bbs	#2,r2,103$
	...
	bbs	#52-31,r3,151$
	bbs	#52-32,r3,152$
	ret
101$:	movzbl	#1,(r1)+
	bbs	#1,r2,102$
	bbs	#2,r2,103$
	...
	bbs	#52-31,r3,151$
	bbs	#52-32,r3,152$
	ret
102$:	movzbl	#2,(r1)+
	bbs	#2,r2,103$
	bbs	#3,r2,104$
	...
	bbs	#52-31,r3,151$
	bbs	#52-32,r3,152$
	ret
151$:	movzbl	#51,(r1)+
	bbs	#52-32,r3,152$
	ret
	...
152$:	movzbl	#52,(r1)+
	ret
    - Jim
 | 
|  | 	.title	sort13
deck_size=	52
hand_size=	13
	.psect	data, long
c.1:	.long	1
c.2:	.long	2
c.3:	.long	3
c.4:	.long	4
c.5:	.long	5
c.6:	.long	6
c.7:	.long	7
c.8:	.long	8
c.9:	.long	9
c.10:	.long	10
c.11:	.long	11
c.12:	.long	12
c.13:	.long	13
c.14:	.long	14
c.15:	.long	15
c.16:	.long	16
c.17:	.long	17
c.18:	.long	18
c.19:	.long	19
c.20:	.long	20
c.21:	.long	21
c.22:	.long	22
c.23:	.long	23
c.24:	.long	24
c.25:	.long	25
c.26:	.long	26
c.27:	.long	27
c.28:	.long	28
c.29:	.long	29
c.30:	.long	30
c.31:	.long	31
c.32:	.long	32
c.33:	.long	33
c.34:	.long	34
c.35:	.long	35
c.36:	.long	36
c.37:	.long	37
c.38:	.long	38
c.39:	.long	39
c.40:	.long	40
c.41:	.long	41
c.42:	.long	42
c.43:	.long	43
c.44:	.long	44
c.45:	.long	45
c.46:	.long	46
c.47:	.long	47
c.48:	.long	48
c.49:	.long	49
c.50:	.long	50
c.51:	.long	51
c.52:	.long	52
temp:	.blkw	deck_size+1
	.psect	code
	.entry	sort13, ^m<r2, r3, r4, r5, r6>
	movc5	#0, (r0), #0, #<deck_size+1>*2, temp
	movl	4(ap), r0
	.repeat	hand_size
	movl	(r0)+, r2
	movw	#4, temp[r2]
	.endr
	movaw	temp+2, r6
	movl	4(ap), r3
	movc	(r6)+, c.1, (r3)+
	movc	(r6)+, c.2, (r3)+
	movc	(r6)+, c.3, (r3)+
	movc	(r6)+, c.4, (r3)+
	movc	(r6)+, c.5, (r3)+
	movc	(r6)+, c.6, (r3)+
	movc	(r6)+, c.7, (r3)+
	movc	(r6)+, c.8, (r3)+
	movc	(r6)+, c.9, (r3)+
	movc	(r6)+, c.10, (r3)+
	movc	(r6)+, c.11, (r3)+
	movc	(r6)+, c.12, (r3)+
	movc	(r6)+, c.13, (r3)+
	movc	(r6)+, c.14, (r3)+
	movc	(r6)+, c.15, (r3)+
	movc	(r6)+, c.16, (r3)+
	movc	(r6)+, c.17, (r3)+
	movc	(r6)+, c.18, (r3)+
	movc	(r6)+, c.19, (r3)+
	movc	(r6)+, c.20, (r3)+
	movc	(r6)+, c.21, (r3)+
	movc	(r6)+, c.22, (r3)+
	movc	(r6)+, c.23, (r3)+
	movc	(r6)+, c.24, (r3)+
	movc	(r6)+, c.25, (r3)+
	movc	(r6)+, c.26, (r3)+
	movc	(r6)+, c.27, (r3)+
	movc	(r6)+, c.28, (r3)+
	movc	(r6)+, c.29, (r3)+
	movc	(r6)+, c.30, (r3)+
	movc	(r6)+, c.31, (r3)+
	movc	(r6)+, c.32, (r3)+
	movc	(r6)+, c.33, (r3)+
	movc	(r6)+, c.34, (r3)+
	movc	(r6)+, c.35, (r3)+
	movc	(r6)+, c.36, (r3)+
	movc	(r6)+, c.37, (r3)+
	movc	(r6)+, c.38, (r3)+
	movc	(r6)+, c.39, (r3)+
	movc	(r6)+, c.40, (r3)+
	movc	(r6)+, c.41, (r3)+
	movc	(r6)+, c.42, (r3)+
	movc	(r6)+, c.43, (r3)+
	movc	(r6)+, c.44, (r3)+
	movc	(r6)+, c.45, (r3)+
	movc	(r6)+, c.46, (r3)+
	movc	(r6)+, c.47, (r3)+
	movc	(r6)+, c.48, (r3)+
	movc	(r6)+, c.49, (r3)+
	movc	(r6)+, c.50, (r3)+
	movc	(r6)+, c.51, (r3)+
	movc	(r6)+, c.52, (r3)+
	ret
	.end
 | 
|  | 	.title	sort13
deck_size=	52
hand_size=	13
sort:	.blkl	deck_size+1
s.siz=	.- sort
temp:	.blkl	hand_size+1
	.entry	sort13, ^m<r2, r3, r4, r5>
	movc3	#hand_size*4, @4(ap), temp+4
	movc5	#0, (r0), #0, #s.siz, sort
	movl	4(ap), r0
;
;	movl	#1, r1
;10$:	movl	(r0)+, r2
;	movl	r1, sort[r2]
;	aobleq	#13, r1, 10$
;
	movl	(r0)+, r2
	movl	#1, sort[r2]
	movl	(r0)+, r2
	movl	#2, sort[r2]
	movl	(r0)+, r2
	movl	#3, sort[r2]
	movl	(r0)+, r2
	movl	#4, sort[r2]
	movl	(r0)+, r2
	movl	#5, sort[r2]
	movl	(r0)+, r2
	movl	#6, sort[r2]
	movl	(r0)+, r2
	movl	#7, sort[r2]
	movl	(r0)+, r2
	movl	#8, sort[r2]
	movl	(r0)+, r2
	movl	#9, sort[r2]
	movl	(r0)+, r2
	movl	#10, sort[r2]
	movl	(r0)+, r2
	movl	#11, sort[r2]
	movl	(r0)+, r2
	movl	#12, sort[r2]
	movl	(r0)+, r2
	movl	#13, sort[r2]
	movl	4(ap), r0
	moval	sort+4, r2
	movl	#hand_size, r1
20$:	movl	(r2)+, r3
	beql	20$
	movl	temp[r3], (r0)+
	sobgtr	r1, 20$
	ret
	.end
 |