SvgaLib/gl/inlstring.h

344 lines
9 KiB
C

/* Based on functions in linux/string.h */
#include <sys/types.h> /* for size_t */
#include "../src/ppcmemset.h"
#if defined (NO_ASSEMBLY)
/*
 * Portable fallbacks used when NO_ASSEMBLY is defined: the byte-oriented
 * helpers simply forward to the C library's memcpy()/memset().
 * NOTE(review): this header does not include <string.h> itself; presumably
 * the including file (or ppcmemset.h) declares memcpy/memset -- confirm.
 */
#define __memcpy(dst,src,n) memcpy((dst),(src),(n))
#define __memcpy_conventional(dst,src,n) memcpy((dst),(src),(n))
#define __memcpyb(dst,src,n) memcpy((dst),(src),(n))
#define __memsetb(dst,c,n) memset((dst),(c),(n))
#define __memset(dst,c,n) memset((dst),(c),(n))
/*
 * Portable __memset4: store the int value `c` into `count` consecutive
 * ints starting at `s`.  `count` is a number of ints, not bytes.
 * Returns `s`.
 */
static inline void *__memset4(void *s, int c, size_t count) {
    int *p = s;
    size_t i;

    /*
     * The index has the same type as `count`: an int index would be
     * converted to unsigned for the comparison and would overflow for
     * counts above INT_MAX.
     */
    for (i = 0; i < count; i++)
        *p++ = c;
    return s;
}
/*
 * Portable __memset2: store the 16-bit value `c` into `count` consecutive
 * shorts starting at `s`.  `count` is a number of 16-bit units, not bytes.
 * Returns `s`.
 */
static inline void *__memset2(void *s, short c, size_t count) {
    short *p = s;
    size_t i;

    /*
     * The index has the same type as `count`: an int index would be
     * converted to unsigned for the comparison and would overflow for
     * counts above INT_MAX.
     */
    for (i = 0; i < count; i++)
        *p++ = c;
    return s;
}
/*
 * Portable __memset3: write `count` 3-byte pixels starting at `s`.
 * Each pixel is the low 24 bits of `c`, stored least significant byte
 * first (bits 0-7, then 8-15, then 16-23).  Returns `s`.
 */
static inline void *__memset3(void *s, int c, size_t count) {
    unsigned char *p = s;
    /* Work on the unsigned bit pattern so the shifts below never act on a
     * negative value. */
    const unsigned int pixel = (unsigned int) c;
    const unsigned char b0 = (unsigned char) (pixel & 0xff);
    const unsigned char b1 = (unsigned char) ((pixel >> 8) & 0xff);
    const unsigned char b2 = (unsigned char) ((pixel >> 16) & 0xff);
    size_t i;

    /* size_t index: matches the type of `count` and cannot overflow. */
    for (i = 0; i < count; i++) {
        *p++ = b0;
        *p++ = b1;
        *p++ = b2;
    }
    return s;
}
#else
/*
 * Copy `n` bytes from `from` to `to` with the x86 string instructions and
 * return `to`.
 *
 * When the destination address is odd, one byte is copied first so that
 * the bulk `rep movsl` runs on a 16-bit aligned destination; the last
 * 0-3 bytes are finished with movsb/movsw.  The copy always runs forward
 * (cld).  The count held in %edx is tested with a signed compare, so a
 * count whose bit 31 is set copies nothing.
 */
static inline void *
__memcpy_conventional(void *to, const void *from, size_t n)
{
    size_t left;            /* %edx on exit; same type as the matched input n */
    long dst_end, src_end;  /* %edi / %esi on exit; not used */

    __asm__ __volatile__("cld\n\t"
                         "cmpl $0,%%edx\n\t"
                         "jle 2f\n\t"
                         "movl %%edi,%%ecx\n\t"
                         "andl $1,%%ecx\n\t"
                         "subl %%ecx,%%edx\n\t"
                         "rep ; movsb\n\t" /* 16-bit align destination */
                         "movl %%edx,%%ecx\n\t"
                         "shrl $2,%%ecx\n\t"
                         "jz 3f\n\t"
                         "rep ; movsl\n\t"
                         "3:\n\t"
                         "testb $1,%%dl\n\t"
                         "je 1f\n\t"
                         "movsb\n"
                         "1:\ttestb $2,%%dl\n\t"
                         "je 2f\n\t"
                         "movsw\n"
                         "2:\n"
                         /* %edx, %edi and %esi are modified, so they are
                          * declared as (discarded) outputs, not clobbers. */
                         : "=d"(left), "=D"(dst_end), "=S"(src_end)
                         : "0"(n), "1"((long) to), "2"((long) from)
                         /* "memory": the asm loads through `from` and stores
                          * through `to`, which the operands do not express. */
                         : "cx", "cc", "memory"
        );
    return (to);
}
/*
 * Copy `n` bytes from `from` to `to` strictly one byte at a time
 * (`rep movsb`, forward direction) and return `to`.
 */
static inline void *
__memcpyb(void *to, const void *from, size_t n)
{
    size_t left;            /* count register on exit; same type as n */
    long dst_end, src_end;  /* %edi / %esi on exit; not used */

    __asm__ __volatile__("cld\n\t"
                         "rep ; movsb\n\t"
                         /* The count, destination and source registers are
                          * all modified, hence the discarded outputs. */
                         : "=c"(left), "=D"(dst_end), "=S"(src_end)
                         : "0"(n), "1"((long) to), "2"((long) from)
                         /* "memory": the asm loads through `from` and stores
                          * through `to`, which the operands do not express. */
                         : "memory"
        );
    return (to);
}
/*
 * Fill `count` bytes starting at `s` with the byte `c`, strictly one byte
 * at a time (`rep stosb`, forward direction).  Returns `s`.
 */
static inline void *
__memsetb(void *s, char c, size_t count)
{
    void *dst = s;        /* advanced by the asm */
    size_t left = count;  /* counted down to zero by the asm */

    /*
     * The destination and count registers are modified by `rep stosb`, so
     * they are read-write operands on local copies.  Listing them as
     * clobbers while they are also inputs is rejected by current
     * compilers.  "memory" tells the compiler that the asm stores
     * through `s`.
     */
    __asm__ __volatile__("cld\n\t"
                         "rep\n\t"
                         "stosb"
                         : "+D"(dst), "+c"(left)
                         : "a"(c)
                         : "memory");
    return s;
}
/*
 * Store the 32-bit value `c` into `count` consecutive 32-bit words
 * starting at `s` (`rep stosl`, forward direction).  `count` is a number
 * of 32-bit words, not bytes.  Returns `s`.
 */
static inline void *
__memset4(void *s, unsigned c, size_t count)
{
    long dst_end;  /* %edi on exit; not used */
    size_t left;   /* count register on exit; same type as count */

    __asm__ __volatile__("cld\n\t"
                         "rep\n\t"
                         "stosl"
                         /* The destination and count registers are modified,
                          * hence the discarded outputs. */
                         : "=D"(dst_end), "=c"(left)
                         : "a"(c), "0"((long) s), "1"(count)
                         /* "memory": the asm stores through `s`, which the
                          * operands do not express. */
                         : "memory"
        );
    return s;
}
/*
 * Fill `count` bytes starting at `s` with the byte `c` and return `s`.
 *
 * Counts below 12 are written with a plain `rep stosb`.  For larger
 * counts the byte is replicated into all four bytes of %eax, 0-3 leading
 * bytes are stored to bring the destination up to a 4-byte boundary, the
 * bulk is written with `rep stosl`, and the last 0-3 bytes with
 * `rep stosb`.
 */
static inline void *
__memset(void *s, char c, size_t count)
{
    int pattern;   /* %eax on exit; not used */
    long dst_end;  /* %edi on exit; not used */
    size_t left;   /* %edx on exit; same type as the matched input count */

    __asm__ __volatile__(
        "cld\n\t"
        "cmpl $12,%%edx\n\t"
        "jl 1f\n\t" /* if (count >= 12) */
        "movzbl %%al,%%eax\n\t"
        "movl %%eax,%%ecx\n\t"
        "shll $8,%%ecx\n\t" /* c |= c << 8 */
        "orl %%ecx,%%eax\n\t"
        "movl %%eax,%%ecx\n\t"
        "shll $16,%%ecx\n\t" /* c |= c << 16 */
        "orl %%ecx,%%eax\n\t"
        /* The pad length must come from the destination address (%edi);
         * deriving it from the count (%edx) still wrote the right number
         * of bytes but left the stosl run unaligned. */
        "movl %%edi,%%ecx\n\t"
        "negl %%ecx\n\t"
        "andl $3,%%ecx\n\t" /* (-s % 4) */
        "subl %%ecx,%%edx\n\t" /* count -= (-s % 4) */
        "rep ; stosb\n\t" /* align to longword boundary */
        "movl %%edx,%%ecx\n\t"
        "shrl $2,%%ecx\n\t"
        "rep ; stosl\n\t" /* fill longwords */
        "andl $3,%%edx\n" /* fill last few bytes */
        "1:\tmovl %%edx,%%ecx\n\t" /* count < 12 entry point */
        "rep ; stosb\n\t"
        /* %eax, %edi and %edx are modified, hence the discarded outputs. */
        : "=a"(pattern), "=D"(dst_end), "=d"(left)
        /* Only %al is read before the movzbl, so widening c is harmless
         * and keeps the matched operands the same size. */
        : "0"((int) c), "1"((long) s), "2"(count)
        /* "memory": the asm stores through `s`, which the operands do not
         * express. */
        : "cx", "cc", "memory"
        );
    return s;
}
/*
 * Store the 16-bit value `c` into `count` consecutive 16-bit units
 * starting at `s` and return `s`.
 *
 * Counts below 12 are written with `rep stosw`.  For larger counts the
 * value is replicated into both halves of %eax, one leading unit is
 * stored if the destination is not on a 4-byte boundary, the bulk is
 * written two units at a time with `rep stosl`, and a possible odd unit
 * is finished with `rep stosw`.
 *
 * The alignment store addresses memory through the 32-bit register
 * (%edi), so this routine is written for 32-bit x86.
 */
static inline void *
__memset2(void *s, short c, size_t count)
/* count is in 16-bit pixels */
/* s is assumed to be 16-bit aligned */
{
    int pattern;   /* %eax on exit; not used */
    long dst_end;  /* %edi on exit; not used */
    size_t left;   /* %edx on exit; same type as the matched input count */

    __asm__ __volatile__(
        "cld\n\t"
        "cmpl $12,%%edx\n\t"
        "jl 1f\n\t" /* if (count >= 12) */
        "movzwl %%ax,%%eax\n\t"
        "movl %%eax,%%ecx\n\t"
        "shll $16,%%ecx\n\t" /* c |= c << 16 */
        "orl %%ecx,%%eax\n\t"
        "movl %%edi,%%ecx\n\t"
        "andl $2,%%ecx\n\t" /* s & 2 */
        "jz 2f\n\t"
        "decl %%edx\n\t" /* count -= 1 */
        "movw %%ax,(%%edi)\n\t" /* align to longword boundary */
        "addl $2,%%edi\n\t"
        "2:\n\t"
        "movl %%edx,%%ecx\n\t"
        "shrl $1,%%ecx\n\t"
        "rep ; stosl\n\t" /* fill longwords */
        "andl $1,%%edx\n" /* one 16-bit word left? */
        "jz 3f\n\t" /* no, finished */
        "1:\tmovl %%edx,%%ecx\n\t" /* count < 12 entry point */
        "rep ; stosw\n\t"
        "3:\n\t"
        /* %eax, %edi and %edx are modified, hence the discarded outputs. */
        : "=a"(pattern), "=D"(dst_end), "=d"(left)
        /* Only %ax is read before the movzwl, so widening c is harmless
         * and keeps the matched operands the same size. */
        : "0"((int) c), "1"((long) s), "2"(count)
        /* "memory": the asm stores through `s`, which the operands do not
         * express. */
        : "cx", "cc", "memory"
        );
    return s;
}
/*
 * Write `count` 24-bit pixels (3 bytes each) starting at `s` and return
 * `s`.  Each pixel is the low three bytes of `c`, stored least
 * significant byte first (called B, G, R in the comments below).
 *
 * For 8 or more pixels three 32-bit patterns (BGRB, GRBG, RBGR) are
 * built so that four pixels can be written as three longwords; 16 pixels
 * per iteration are written in the unrolled loop, then 4 at a time, and
 * the remaining 0-3 pixels one at a time.  Fewer than 8 pixels go
 * straight to the one-at-a-time loop.
 *
 * The top byte of `c` should be zero: for 8 or more pixels it is OR-ed
 * into the fourth byte of the BGRB pattern.  The count held in %edx is
 * compared as a signed value, so a count with bit 31 set writes nothing.
 * All stores address memory through the 32-bit register (%edi), so this
 * routine is written for 32-bit x86.
 */
static inline void *
__memset3(void *s, int c, size_t count)
/* count is in 24-bit pixels (3 bytes per pixel) */
{
    int pattern;   /* %eax on exit; not used */
    long dst_end;  /* %edi on exit; not used */
    size_t left;   /* %edx on exit; same type as the matched input count */

    __asm__ __volatile__(
        /* Keep the pixel in %esi on every path: the tail loop below reads
         * it, and the short path (count < 8) jumps there directly. */
        "movl %%eax,%%esi\n\t" /* esi = (low) BGR0 (high) */
        "cmpl $8,%%edx\n\t"
        "jl 2f\n\t"
        "shll $24,%%eax\n\t" /* eax = 000B */
        "orl %%eax,%%esi\n\t" /* esi = BGRB */
        "movl %%esi,%%eax\n\t"
        "shrl $8,%%eax\n\t" /* eax = GRB0 */
        "movl %%eax,%%ecx\n\t"
        "shll $24,%%ecx\n\t" /* ecx = 000G */
        "orl %%ecx,%%eax\n\t" /* eax = GRBG */
        "movl %%esi,%%ecx\n\t"
        "shll $8,%%ecx\n\t" /* ecx = 0BGR */
        "movb %%ah,%%cl\n\t" /* ecx = RBGR */
        "cmpl $16,%%edx\n\t"
        "jl 1f\n\t"
        "jmp 5f\n\t"
        ".align 4,0x90\n\t"
        "5:\n\t" /* loop unrolling */
        "movl %%esi,(%%edi)\n\t" /* write BGRB */
        "movl %%eax,4(%%edi)\n\t" /* write GRBG */
        "movl %%ecx,8(%%edi)\n\t" /* write RBGR */
        "movl %%esi,12(%%edi)\n\t"
        "movl %%eax,16(%%edi)\n\t"
        "movl %%ecx,20(%%edi)\n\t"
        "movl %%esi,24(%%edi)\n\t"
        "movl %%eax,28(%%edi)\n\t"
        "movl %%ecx,32(%%edi)\n\t"
        "movl %%esi,36(%%edi)\n\t"
        "subl $16,%%edx\n\t" /* blend end-of-loop instr. */
        "movl %%eax,40(%%edi)\n\t"
        "movl %%ecx,44(%%edi)\n\t"
        "addl $48,%%edi\n\t"
        "cmpl $16,%%edx\n\t"
        "jge 5b\n\t"
        "andl %%edx,%%edx\n\t"
        "jz 4f\n\t" /* finished */
        "cmpl $4,%%edx\n\t"
        "jl 2f\n\t" /* less than 4 pixels left */
        "jmp 1f\n\t"
        ".align 4,0x90\n\t"
        "1:\n\t"
        "movl %%esi,(%%edi)\n\t" /* write BGRB */
        "movl %%eax,4(%%edi)\n\t" /* write GRBG */
        "movl %%ecx,8(%%edi)\n\t" /* write RBGR */
        "addl $12,%%edi\n\t"
        "subl $4,%%edx\n\t"
        "cmpl $4,%%edx\n\t"
        "jge 1b\n\t"
        "2:\n\t"
        "cmpl $0,%%edx\n\t" /* none left? */
        "jle 4f\n\t" /* finished */
        /* Take the pixel from %esi (low bytes B, G, R on every path).
         * %ecx only holds a pattern after the 8+ pixel setup, so it must
         * not be read here. */
        "movl %%esi,%%eax\n\t"
        "shrl $16,%%eax\n\t" /* R in al */
        "3:\n\t" /* write last few pixels */
        "movw %%si,(%%edi)\n\t" /* write BG */
        "movb %%al,2(%%edi)\n\t" /* write R */
        "addl $3,%%edi\n\t"
        "decl %%edx\n\t"
        "jnz 3b\n\t"
        "4:\n\t"
        /* %eax, %edi and %edx are modified, hence the discarded outputs. */
        : "=a"(pattern), "=D"(dst_end), "=d"(left)
        : "0"(c), "1"((long) s), "2"(count)
        /* "memory": the asm stores through `s`, which the operands do not
         * express. */
        : "cx", "si", "cc", "memory"
        );
    return s;
}
/* Functions for which arguments must be passed in %ebx, %edx, and %ecx. */
#if 0 /* Why declare 'em? Just confuses the compiler and can't be called from C
anyway */
extern __memcpyasm_regargs(); /* nu_bytes >= 3 */
extern __memcpyasm_regargs_aligned(); /* nu_bytes >= 32 */
#endif
/* Always 32-bit align destination, even for a small number of bytes. */
/*
 * Dispatches on n, held in %ecx: for n <= 3 it makes an indirect call
 * through entry n of the external table __memcpy_jumptable (4-byte
 * entries); for anything larger it calls the external routine
 * __memcpyasm_regargs.  The compare is unsigned (ja), so a negative n
 * also takes the __memcpyasm_regargs path.  Returns dest.
 *
 * Operands as bound here: dest in %esi, src in %edx, n in %ecx.
 * NOTE(review): the comment above the disabled prototypes names %ebx,
 * %edx and %ecx instead -- confirm which convention the assembly
 * routines really use.
 * NOTE(review): the clobber list names "0", "1", "2" and there is no
 * "memory" clobber although the called routines presumably write through
 * dest -- confirm against the target compiler.
 */
static inline void *
__memcpy_aligndest(void *dest, const void *src, int n)
{
__asm__ __volatile__("cmpl $3, %%ecx\n\t"
"ja 1f\n\t"
/* n in 0..3: per-size routine selected through the jump table */
"call * __memcpy_jumptable (, %%ecx, 4)\n\t"
"jmp 2f\n\t"
"1:call __memcpyasm_regargs\n\t"
"2:":
:"S"(dest), "d"(src), "c"(n)
:"ax", "0", "1", "2");
return dest;
}
/* Optimized version for 32-bit aligned destination. */
/*
 * Dispatches on n, held in %ecx: for n <= 32 it makes an indirect call
 * through entry n of the external table __memcpy_jumptable (4-byte
 * entries); for anything larger it calls the external routine
 * __memcpyasm_regargs_aligned.  The compare is unsigned (ja), so a
 * negative n also takes the __memcpyasm_regargs_aligned path.
 * Returns dest.
 *
 * Operands as bound here: dest in %esi, src in %edx, n in %ecx.
 * NOTE(review): the clobber list names "0", "1", "2" and there is no
 * "memory" clobber although the called routines presumably write through
 * dest -- confirm against the target compiler.
 */
static inline void *
__memcpy_destaligned(void *dest, const void *src, int n)
{
__asm__ __volatile__("cmpl $32, %%ecx\n\t"
"ja 1f\n\t"
/* n in 0..32: per-size routine selected through the jump table */
"call * __memcpy_jumptable (, %%ecx, 4)\n\t"
"jmp 2f\n\t"
"1:call __memcpyasm_regargs_aligned\n\t"
"2:\n\t":
:"S"(dest), "d"(src), "c"(n)
:"ax", "0", "1", "2");
return dest;
}
/* Balanced inline memcpy; 32-bit align destination if nu_bytes >= 20. */
/*
 * Dispatches on n, held in %ecx: for n <= 19 it makes an indirect call
 * through entry n of the external table __memcpy_jumptable (4-byte
 * entries); for anything larger it calls the external routine
 * __memcpyasm_regargs.  The compare is unsigned (ja), so a negative n
 * also takes the __memcpyasm_regargs path.  Returns dest.
 *
 * Operands as bound here: dest in %esi, src in %edx, n in %ecx, each
 * cast to long first.
 * NOTE(review): the clobber list names "0", "1", "2" and there is no
 * "memory" clobber although the called routines presumably write through
 * dest -- confirm against the target compiler.
 */
static inline void *
__memcpy_balanced(void *dest, const void *src, int n)
{
__asm__ __volatile__("cmpl $19, %%ecx\n\t"
"ja 1f\n\t"
/* n in 0..19: per-size routine selected through the jump table */
"call * __memcpy_jumptable (, %%ecx, 4)\n\t"
"jmp 2f\n\t"
"1:call __memcpyasm_regargs\n\t"
"2:\n\t"
:
:"S"((long) dest), "d"((long) src), "c"((long) n)
:"ax", "0", "1", "2");
return dest;
}
/* In the assembly build, plain __memcpy is the self-contained string-
 * instruction copy rather than one of the jump-table variants. */
#define __memcpy __memcpy_conventional
#endif
/* Functions defined in mem.S or mem.c */
/*
 * Declared in both the NO_ASSEMBLY and the assembly build.
 * NOTE(review): the definitions are not visible from this header; going
 * by the names only, these presumably convert 4-byte pixels to 3-byte
 * pixels and copy 32-bit words shifted by 8 bits, with n a count whose
 * unit must be confirmed against mem.c / mem.S.
 */
extern void __svgalib_memcpy4to3(void *dest, void *src, int n);
extern void __svgalib_memcpy32shift8(void *dest, void *src, int n);