/*
 * Snapshot metadata: Felisp, revision 84436a5ae0 (344 lines, 9 KiB, C).
 * Upstream: http://my.svgalib.org/svgalib/svgalib-1.9.25.tar.gz
 *           http://my.svgalib.org/svgalib/
 */

/* Based on functions in linux/string.h */

#include <sys/types.h> /* for size_t */
#include "../src/ppcmemset.h"

#if defined (NO_ASSEMBLY)

/*
 * Portable fallbacks used when inline assembly is disabled: the byte-wise
 * copy and fill helpers are plain aliases for the C library routines.
 * The expansions reference memcpy()/memset(), so a declaration of those
 * (normally <string.h>) has to be visible wherever the macros are used.
 */
#define __memcpy(dst,src,n) memcpy((dst),(src),(n))
#define __memcpy_conventional(dst,src,n) memcpy((dst),(src),(n))
#define __memcpyb(dst,src,n) memcpy((dst),(src),(n))
#define __memsetb(dst,c,n) memset((dst),(c),(n))
#define __memset(dst,c,n) memset((dst),(c),(n))

/*
 * Portable __memset4: store the value c into `count` consecutive ints
 * starting at s.  `count` is a number of elements, not bytes.
 * Returns s.
 *
 * The loop index is a size_t so that it has the same type and range as
 * `count`; an int index would overflow for counts above INT_MAX.
 */
static inline void *__memset4(void *s, int c, size_t count) {
    int *p = s;
    size_t i;

    for (i = 0; i < count; i++)
        *p++ = c;
    return s;
}

/*
 * Portable __memset2: store the value c into `count` consecutive shorts
 * starting at s.  `count` is a number of elements, not bytes.
 * Returns s.
 *
 * The loop index is a size_t so that it has the same type and range as
 * `count`; an int index would overflow for counts above INT_MAX.
 */
static inline void *__memset2(void *s, short c, size_t count) {
    short *p = s;
    size_t i;

    for (i = 0; i < count; i++)
        *p++ = c;
    return s;
}

/*
 * Portable __memset3: write `count` 3-byte groups starting at s.  Each
 * group holds the low 24 bits of c, least significant byte first; bits
 * 24..31 of c are ignored.
 * Returns s.
 *
 * The value is converted to unsigned before shifting so that a negative c
 * is not right-shifted as a signed quantity, and the loop index is a
 * size_t so it cannot overflow before reaching `count`.
 */
static inline void *__memset3(void *s, int c, size_t count) {
    unsigned char *p = s;
    const unsigned int pixel = (unsigned int) c;
    size_t i;

    for (i = 0; i < count; i++) {
        *p++ = (unsigned char) (pixel & 0xff);
        *p++ = (unsigned char) ((pixel >> 8) & 0xff);
        *p++ = (unsigned char) ((pixel >> 16) & 0xff);
    }
    return s;
}

#else /* !NO_ASSEMBLY: inline-assembly implementations */

/*
 * Copy n bytes from `from` to `to` using the x86 string instructions and
 * return `to`.
 *
 * One leading byte is copied if the destination address is odd, then the
 * bulk is moved with "rep movsl", and the remaining 0..3 bytes are moved
 * with a movsb and/or a movsw.  The count is tested with a signed
 * comparison, so a value of n with the sign bit set copies nothing.
 *
 * The registers that carry the arguments (%edx, %edi, %esi) are modified
 * by the code; they are therefore tied to scratch outputs instead of being
 * listed as clobbers.  The "memory" clobber tells the compiler that the
 * asm reads and writes memory that is not named in the operand list.
 */
static inline void *
__memcpy_conventional(void *to, const void *from, size_t n)
{
    size_t count_out;		/* final %edx, unused */
    void *dst_out;		/* final %edi, unused */
    const void *src_out;	/* final %esi, unused */

    __asm__ __volatile__(
	"cld\n\t"
	"cmpl $0,%%edx\n\t"
	"jle 2f\n\t"
	"movl %%edi,%%ecx\n\t"
	"andl $1,%%ecx\n\t"
	"subl %%ecx,%%edx\n\t"
	"rep ; movsb\n\t"	/* 16-bit align destination */
	"movl %%edx,%%ecx\n\t"
	"shrl $2,%%ecx\n\t"
	"jz 3f\n\t"
	"rep ; movsl\n\t"
	"3:\n\t"
	"testb $1,%%dl\n\t"
	"je 1f\n\t"
	"movsb\n"
	"1:\ttestb $2,%%dl\n\t"
	"je 2f\n\t"
	"movsw\n"
	"2:\n"
	: "=d"(count_out), "=D"(dst_out), "=S"(src_out)	/* fake outputs */
	: "0"(n), "1"(to), "2"(from)
	: "cx", "memory", "cc"
    );
    return to;
}

/*
 * Copy n bytes from `from` to `to` one byte at a time ("rep movsb") and
 * return `to`.
 *
 * %ecx, %edi and %esi are consumed by the instruction, so they are tied to
 * scratch outputs of the same type as the inputs.  The "memory" clobber
 * tells the compiler that the asm reads and writes memory that is not
 * named in the operand list.
 */
static inline void *
__memcpyb(void *to, const void *from, size_t n)
{
    size_t count_out;		/* final %ecx, unused */
    void *dst_out;		/* final %edi, unused */
    const void *src_out;	/* final %esi, unused */

    __asm__ __volatile__(
	"cld\n\t"
	"rep ; movsb\n\t"
	: "=c"(count_out), "=D"(dst_out), "=S"(src_out)	/* fake outputs */
	: "0"(n), "1"(to), "2"(from)
	: "memory", "cc"
    );
    return to;
}

/*
 * Fill `count` bytes starting at s with the byte c ("rep stosb") and
 * return s.
 *
 * "rep stosb" advances %edi and counts %ecx down.  The original expressed
 * that by naming "cx" and "di" in the clobber list while also using them
 * as input operands, which GCC does not permit.  They are now tied to
 * scratch outputs, the same scheme the other helpers in this file use.
 * The "memory" clobber tells the compiler that the asm writes memory that
 * is not named in the operand list.
 */
static inline void *
__memsetb(void *s, char c, size_t count)
{
    void *dst_out;		/* final %edi, unused */
    size_t count_out;		/* final %ecx, unused */

    __asm__ __volatile__(
	"cld\n\t"
	"rep ; stosb"
	: "=D"(dst_out), "=c"(count_out)	/* fake outputs */
	: "a"(c), "0"(s), "1"(count)
	: "memory", "cc"
    );
    return s;
}

/*
 * Store the 32-bit value c into `count` consecutive longwords starting at
 * s ("rep stosl") and return s.  `count` is a number of longwords, not
 * bytes.
 *
 * %edi and %ecx are consumed by the instruction and are tied to scratch
 * outputs.  The "memory" clobber tells the compiler that the asm writes
 * memory that is not named in the operand list.
 */
static inline void *
__memset4(void *s, unsigned c, size_t count)
{
    void *dst_out;		/* final %edi, unused */
    size_t count_out;		/* final %ecx, unused */

    __asm__ __volatile__(
	"cld\n\t"
	"rep ; stosl"
	: "=D"(dst_out), "=c"(count_out)	/* fake outputs */
	: "a"(c), "0"(s), "1"(count)
	: "memory", "cc"
    );
    return s;
}

/*
 * Fill `count` bytes starting at s with the byte c and return s.
 *
 * Fills shorter than 12 bytes are done with a single "rep stosb".  Longer
 * fills replicate c into all four bytes of %eax, store single bytes until
 * the destination is longword aligned, store longwords, and finish the
 * last 0..3 bytes with "rep stosb".
 *
 * The length of the alignment prefix is derived from the destination
 * address (%edi).  The previous code derived it from the count (%edx),
 * which still filled the right number of bytes but did not align the
 * longword stores as the comments describe.
 *
 * %eax, %edi and %edx are modified and therefore tied to scratch outputs.
 * The "memory" clobber tells the compiler that the asm writes memory that
 * is not named in the operand list.
 */
static inline void *
__memset(void *s, char c, size_t count)
{
    int fill_out;		/* final %eax, unused */
    void *dst_out;		/* final %edi, unused */
    size_t count_out;		/* final %edx, unused */

    __asm__ __volatile__(
	"cld\n\t"
	"cmpl $12,%%edx\n\t"
	"jl 1f\n\t"			/* if (count >= 12) */

	"movzbl %%al,%%eax\n\t"
	"movl %%eax,%%ecx\n\t"
	"shll $8,%%ecx\n\t"		/* c |= c << 8 */
	"orl %%ecx,%%eax\n\t"
	"movl %%eax,%%ecx\n\t"
	"shll $16,%%ecx\n\t"		/* c |= c << 16 */
	"orl %%ecx,%%eax\n\t"

	"movl %%edi,%%ecx\n\t"		/* alignment comes from s, not count */
	"negl %%ecx\n\t"
	"andl $3,%%ecx\n\t"		/* (-s % 4) */
	"subl %%ecx,%%edx\n\t"		/* count -= (-s % 4) */
	"rep ; stosb\n\t"		/* align to longword boundary */

	"movl %%edx,%%ecx\n\t"
	"shrl $2,%%ecx\n\t"
	"rep ; stosl\n\t"		/* fill longwords */

	"andl $3,%%edx\n"		/* fill last few bytes */
	"1:\tmovl %%edx,%%ecx\n\t"	/* count < 12 entry point */
	"rep ; stosb\n\t"
	: "=a"(fill_out), "=D"(dst_out), "=d"(count_out)	/* fake outputs */
	: "0"((int) c), "1"(s), "2"(count)
	: "cx", "memory", "cc"
    );
    return s;
}

/*
 * Store the 16-bit value c into `count` consecutive 16-bit words starting
 * at s and return s.
 *
 * count is in 16-bit pixels.
 * s is assumed to be 16-bit aligned.
 *
 * Fills shorter than 12 words use a single "rep stosw".  Longer fills
 * replicate c into both halves of %eax, store one word first if s is not
 * longword aligned, store longwords, and finish with one more word if an
 * odd word is left over.
 *
 * %eax, %edi and %edx are modified and therefore tied to scratch outputs.
 * The "memory" clobber tells the compiler that the asm writes memory that
 * is not named in the operand list.
 */
static inline void *
__memset2(void *s, short c, size_t count)
{
    int fill_out;		/* final %eax, unused */
    void *dst_out;		/* final %edi, unused */
    size_t count_out;		/* final %edx, unused */

    __asm__ __volatile__(
	"cld\n\t"
	"cmpl $12,%%edx\n\t"
	"jl 1f\n\t"			/* if (count >= 12) */

	"movzwl %%ax,%%eax\n\t"
	"movl %%eax,%%ecx\n\t"
	"shll $16,%%ecx\n\t"		/* c |= c << 16 */
	"orl %%ecx,%%eax\n\t"

	"movl %%edi,%%ecx\n\t"
	"andl $2,%%ecx\n\t"		/* s & 2 */
	"jz 2f\n\t"
	"decl %%edx\n\t"		/* count -= 1 */
	"movw %%ax,(%%edi)\n\t"		/* align to longword boundary */
	"addl $2,%%edi\n\t"

	"2:\n\t"
	"movl %%edx,%%ecx\n\t"
	"shrl $1,%%ecx\n\t"
	"rep ; stosl\n\t"		/* fill longwords */

	"andl $1,%%edx\n"		/* one 16-bit word left? */
	"jz 3f\n\t"			/* no, finished */
	"1:\tmovl %%edx,%%ecx\n\t"	/* count < 12 entry point */
	"rep ; stosw\n\t"
	"3:\n\t"
	: "=a"(fill_out), "=D"(dst_out), "=d"(count_out)	/* fake outputs */
	: "0"((int) c), "1"(s), "2"(count)
	: "cx", "memory", "cc"
    );
    return s;
}

/*
 * Write `count` 3-byte pixels starting at s and return s.  Each pixel is
 * the low 24 bits of c, least significant byte first (called B, G, R in
 * the comments below).
 *
 * count is in 24-bit pixels (3 bytes per pixel).
 *
 * Four pixels occupy exactly three longwords (BGRB GRBG RBGR), so the bulk
 * is written as longword triples: 16 pixels per iteration while at least
 * 16 remain, then 4 pixels per iteration, then pixel by pixel.
 *
 * Changes from the previous version:
 *  - The three pattern registers are built before the first count test.
 *    Previously a count below 8 jumped straight to the per-pixel tail,
 *    which takes its data from %ecx, before %ecx had been loaded.
 *  - c is masked to 24 bits first, so bits 24..31 of c can no longer leak
 *    into the B byte of the BGRB pattern.
 *  - A "memory" clobber tells the compiler that the asm writes memory that
 *    is not named in the operand list.
 *
 * The count is tested with signed comparisons, so a count with the sign
 * bit set writes nothing.
 */
static inline void *
__memset3(void *s, int c, size_t count)
{
    int fill_out;		/* final %eax, unused */
    void *dst_out;		/* final %edi, unused */
    size_t count_out;		/* final %edx, unused */

    __asm__ __volatile__(
	"andl $0xffffff,%%eax\n\t"	/* keep the 24 pixel bits only */
	"movl %%eax,%%esi\n\t"		/* esi = (low) BGR0 (high) */
	"shll $24,%%eax\n\t"		/* eax = 000B */
	"orl %%eax,%%esi\n\t"		/* esi = BGRB */

	"movl %%esi,%%eax\n\t"
	"shrl $8,%%eax\n\t"		/* eax = GRB0 */
	"movl %%eax,%%ecx\n\t"
	"shll $24,%%ecx\n\t"		/* ecx = 000G */
	"orl %%ecx,%%eax\n\t"		/* eax = GRBG */

	"movl %%esi,%%ecx\n\t"
	"shll $8,%%ecx\n\t"		/* ecx = 0BGR */
	"movb %%ah,%%cl\n\t"		/* ecx = RBGR */

	"cmpl $8,%%edx\n\t"
	"jl 2f\n\t"			/* short fill: per-pixel tail only */

	"cmpl $16,%%edx\n\t"
	"jl 1f\n\t"
	"jmp 5f\n\t"
	".align 4,0x90\n\t"

	"5:\n\t"			/* loop unrolling */
	"movl %%esi,(%%edi)\n\t"	/* write BGRB */
	"movl %%eax,4(%%edi)\n\t"	/* write GRBG */
	"movl %%ecx,8(%%edi)\n\t"	/* write RBGR */
	"movl %%esi,12(%%edi)\n\t"
	"movl %%eax,16(%%edi)\n\t"
	"movl %%ecx,20(%%edi)\n\t"
	"movl %%esi,24(%%edi)\n\t"
	"movl %%eax,28(%%edi)\n\t"
	"movl %%ecx,32(%%edi)\n\t"
	"movl %%esi,36(%%edi)\n\t"
	"subl $16,%%edx\n\t"		/* blend end-of-loop instr. */
	"movl %%eax,40(%%edi)\n\t"
	"movl %%ecx,44(%%edi)\n\t"
	"addl $48,%%edi\n\t"
	"cmpl $16,%%edx\n\t"
	"jge 5b\n\t"
	"andl %%edx,%%edx\n\t"
	"jz 4f\n\t"			/* finished */
	"cmpl $4,%%edx\n\t"
	"jl 2f\n\t"			/* less than 4 pixels left */
	"jmp 1f\n\t"
	".align 4,0x90\n\t"

	"1:\n\t"
	"movl %%esi,(%%edi)\n\t"	/* write BGRB */
	"movl %%eax,4(%%edi)\n\t"	/* write GRBG */
	"movl %%ecx,8(%%edi)\n\t"	/* write RBGR */
	"addl $12,%%edi\n\t"
	"subl $4,%%edx\n\t"
	"cmpl $4,%%edx\n\t"
	"jge 1b\n\t"

	"2:\n\t"
	"cmpl $0,%%edx\n\t"		/* none left? */
	"jle 4f\n\t"			/* finished */

	"movl %%ecx,%%eax\n\t"		/* eax = RBGR, so al = R */
	"shrl $8,%%ecx\n\t"		/* ecx = BGR0, so cx = BG */

	"3:\n\t"			/* write last few pixels */
	"movw %%cx,(%%edi)\n\t"		/* write BG */
	"movb %%al,2(%%edi)\n\t"	/* write R */
	"addl $3,%%edi\n\t"
	"decl %%edx\n\t"
	"jnz 3b\n\t"

	"4:\n\t"
	: "=a"(fill_out), "=D"(dst_out), "=d"(count_out)	/* fake outputs */
	: "0"(c), "1"(s), "2"(count)
	: "cx", "si", "memory", "cc"
    );
    return s;
}

/*
 * Functions for which arguments must be passed in %ebx, %edx, and %ecx.
 * NOTE(review): the inline wrappers in this file load the destination
 * into %esi rather than %ebx; confirm the register convention against
 * the assembly routines.
 */
#if 0	/* Why declare 'em? Just confuses the compiler and can't be called from C anyway */
extern __memcpyasm_regargs();	/* nu_bytes >= 3 */
extern __memcpyasm_regargs_aligned();	/* nu_bytes >= 32 */
#endif

/*
 * Always 32-bit align destination, even for a small number of bytes.
 *
 * Dispatches on n (compared as unsigned): values 0..3 call the entry
 * __memcpy_jumptable[n], anything larger calls __memcpyasm_regargs.  Both
 * symbols are defined outside this file.  dest, src and n are handed over
 * in %esi, %edx and %ecx.  Returns dest.
 *
 * The original clobber list was "ax", "0", "1", "2".  The argument
 * registers are now tied to scratch outputs instead, which is how GCC
 * expects modified input registers to be declared, and a "memory" clobber
 * covers the copy itself.
 * NOTE(review): which registers the external routines really modify is
 * not visible here; this declares %eax plus the three argument registers.
 */
static inline void *
__memcpy_aligndest(void *dest, const void *src, int n)
{
    void *dest_out;		/* final %esi, unused */
    const void *src_out;	/* final %edx, unused */
    int n_out;			/* final %ecx, unused */

    __asm__ __volatile__(
	"cmpl $3, %%ecx\n\t"
	"ja 1f\n\t"
	"call * __memcpy_jumptable (, %%ecx, 4)\n\t"
	"jmp 2f\n\t"
	"1:call __memcpyasm_regargs\n\t"
	"2:"
	: "=S"(dest_out), "=d"(src_out), "=c"(n_out)	/* fake outputs */
	: "0"(dest), "1"(src), "2"(n)
	: "ax", "memory", "cc"
    );
    return dest;
}

/*
 * Optimized version for 32-bit aligned destination.
 *
 * Dispatches on n (compared as unsigned): values 0..32 call the entry
 * __memcpy_jumptable[n], anything larger calls
 * __memcpyasm_regargs_aligned.  Both symbols are defined outside this
 * file.  dest, src and n are handed over in %esi, %edx and %ecx.
 * Returns dest.
 *
 * The original clobber list was "ax", "0", "1", "2".  The argument
 * registers are now tied to scratch outputs instead, which is how GCC
 * expects modified input registers to be declared, and a "memory" clobber
 * covers the copy itself.
 * NOTE(review): which registers the external routines really modify is
 * not visible here; this declares %eax plus the three argument registers.
 */
static inline void *
__memcpy_destaligned(void *dest, const void *src, int n)
{
    void *dest_out;		/* final %esi, unused */
    const void *src_out;	/* final %edx, unused */
    int n_out;			/* final %ecx, unused */

    __asm__ __volatile__(
	"cmpl $32, %%ecx\n\t"
	"ja 1f\n\t"
	"call * __memcpy_jumptable (, %%ecx, 4)\n\t"
	"jmp 2f\n\t"
	"1:call __memcpyasm_regargs_aligned\n\t"
	"2:\n\t"
	: "=S"(dest_out), "=d"(src_out), "=c"(n_out)	/* fake outputs */
	: "0"(dest), "1"(src), "2"(n)
	: "ax", "memory", "cc"
    );
    return dest;
}

/*
 * Balanced inline memcpy; 32-bit align destination if nu_bytes >= 20.
 *
 * Dispatches on n (compared as unsigned): values 0..19 call the entry
 * __memcpy_jumptable[n], anything larger calls __memcpyasm_regargs.  Both
 * symbols are defined outside this file.  dest, src and n are handed over
 * in %esi, %edx and %ecx.  Returns dest.
 *
 * The original clobber list was "ax", "0", "1", "2".  The argument
 * registers are now tied to scratch outputs instead, which is how GCC
 * expects modified input registers to be declared, and a "memory" clobber
 * covers the copy itself.
 * NOTE(review): which registers the external routines really modify is
 * not visible here; this declares %eax plus the three argument registers.
 */
static inline void *
__memcpy_balanced(void *dest, const void *src, int n)
{
    void *dest_out;		/* final %esi, unused */
    const void *src_out;	/* final %edx, unused */
    int n_out;			/* final %ecx, unused */

    __asm__ __volatile__(
	"cmpl $19, %%ecx\n\t"
	"ja 1f\n\t"
	"call * __memcpy_jumptable (, %%ecx, 4)\n\t"
	"jmp 2f\n\t"
	"1:call __memcpyasm_regargs\n\t"
	"2:\n\t"
	: "=S"(dest_out), "=d"(src_out), "=c"(n_out)	/* fake outputs */
	: "0"(dest), "1"(src), "2"(n)
	: "ax", "memory", "cc"
    );
    return dest;
}

/* In the inline-assembly build __memcpy is an alias for __memcpy_conventional. */
#define __memcpy __memcpy_conventional

#endif /* NO_ASSEMBLY */

/*
 * Functions defined in mem.S or mem.c.
 * Only the prototypes are visible here.
 * NOTE(review): the names suggest a 4-byte to 3-byte pixel repack and a
 * 32-bit copy shifted by 8 bits, and n is presumably a pixel count;
 * confirm against the definitions.
 */
extern void __svgalib_memcpy4to3(void *dest, void *src, int n);
extern void __svgalib_memcpy32shift8(void *dest, void *src, int n);