Again: Kernel speed and inlined string.h routines

Again: Kernel speed and inlined string.h routines

Post by Werner Fi » Wed, 03 May 1995 04:00:00



I've found in:

    sunsite.unc.edu/pub/Linux/Kernel/patches/misc/lx10inline_v2.tgz

inlined assembler versions of library functions (memcpy etc.) for Linux on
i486 CPUs.  Including these functions (written for kernel version 1.0.9)
in kernel 1.2.5 really increases the kernel speed on my machine.

Note: I'm _NOT_ the author of the assembler functions. For more
      information please read the README from the authors Alberto Vignani
      and Davide Parodi in the original source lx10inline_v2.tgz!

I've also enclosed a patch from Robert Wilhelm
<Robert.Wilh...@Physik.TU-Muenchen.DE> for the inlined assembler function
ntohl in linux/include/asm-i386/byteorder.h.

The enclosed patch modifies the files
  linux/arch/i386/config.in
  linux/lib/string.c
  linux/include/asm-i386/string.h
  linux/include/asm-i386/byteorder.h
against 1.2.6+.

After patching, run `make config'
and enable both the 486-specific optimizations _and_ the assembler optimized
library.
Note: After setting CONFIG_M486 _and_ CONFIG_ASM_OPTIMIZE the new kernel
      only works on 486 or 586 CPUs!

Werner

PS: Todo: Include more functions like bcopy;
          Better optimization of the C-code part of memcpy
          and memset in linux/include/asm-i386/string.h
          (like the i386 versions written by Linus)
------------------------------------------------------------------------
diff -u5 linux/arch/i386/config.in.oldd linux/arch/i386/config.in
--- linux/arch/i386/config.in.oldd      Sun Apr 30 21:25:11 1995
+++ linux/arch/i386/config.in   Tue May  2 19:20:03 1995
@@ -33,10 +33,14 @@
 #bool 'Use -mpentium flag for Pentium-specific optimizations' CONFIG_M586 n
 #if [ "$CONFIG_M586" = "n" ]; then
 bool 'Use -m486 flag for 486-specific optimizations' CONFIG_M486 y
 #fi

+comment 'Using of assembler optimized library (experimental)'
+comment 'combined with 486-specific optimizations only on 486 or 586'
+bool 'Assembler optimized library' CONFIG_ASM_OPTIMIZE y
+
 comment 'Loadable module support'
 bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS n

 if [ "$CONFIG_NET" = "y" ]; then
 comment 'Networking options'
diff -u5 linux/lib/string.c.oldd linux/lib/string.c
--- linux/lib/string.c.oldd     Wed Apr 19 11:13:35 1995
+++ linux/lib/string.c  Tue May  2 19:21:21 1995
@@ -9,10 +9,18 @@
  * as inline code in <asm-xx/string.h>
  *
  * These are buggy as well..
  */

+#if defined(CONFIG_ASM_OPTIMIZE)
+
+#include <linux/string.h>
+
+char * ___strtok = NULL;
+
+#else /* CONFIG_ASM_OPTIMIZE */
+
 #include <linux/types.h>

 char * ___strtok = NULL;

 char * strcpy(char * dest,const char *src)
@@ -239,5 +247,7 @@
                p++;
                size--;
        }
        return (void *) p;
 }
+
+#endif /* CONFIG_ASM_OPTIMIZE */
diff -u5 linux/include/asm-i386/string.h.oldd linux/include/asm-i386/string.h
--- linux/include/asm-i386/string.h.oldd        Mon Jan  9 05:33:23 1995
+++ linux/include/asm-i386/string.h     Tue May  2 19:06:53 1995
@@ -8,28 +8,72 @@
  * see especially strtok,strstr,str[c]spn. They should work, but are not
  * very easy to understand. Everything is done entirely within the register
  * set, making the functions fast and clean. String instructions have been
  * used through-out, making for "slightly" unclear code :-)
  *
- *             Copyright (C) 1991, 1992 Linus Torvalds
+ *             Copyright (C) 1991,1992,1993,1994 Linus Torvalds
+ *             Revised and optimized for i486/pentium
+ *             1994/03/15 by Alberto Vignani/Davide Parodi @crf.it
  */

 extern inline char * strcpy(char * dest,const char *src)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp= (char *)dest;
+  register char dummy;
+  __asm__ __volatile__
+    ("\n1:\t"
+     "movb (%0),%2\n\t"
+     "incl %0\n\t"
+     "movb %2,(%1)\n\t"
+     "incl %1\n\t"
+     "testb %2,%2\n\t"
+     "jne 1b"
+     :"=r" (src), "=r" (tmp), "=q" (dummy)
+     :"0" (src), "1" (tmp)
+     :"memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n"
        "1:\tlodsb\n\t"
        "stosb\n\t"
        "testb %%al,%%al\n\t"
        "jne 1b"
        : /* no output */
        :"S" (src),"D" (dest):"si","di","ax","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strncpy(char * dest,const char *src,size_t count)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp= (char *)dest;
+  register char dummy;
+  if (count) {
+  __asm__ __volatile__ (
+        "\n1:\t"
+        "movb (%0),%2\n\t"
+        "incl %0\n\t"
+        "movb %2,(%1)\n\t"
+        "incl %1\n\t"
+        "decl %3\n\t"
+        "je 3f\n\t"
+        "testb %2,%2\n\t"
+        "jne 1b\n\t"
+        "2:\tmovb %2,(%1)\n\t"
+        "incl %1\n\t"
+        "decl %3\n\t"
+        "jne 2b\n\t"
+        "3:"
+        :"=r" (src), "=r" (tmp), "=q" (dummy), "=r" (count)
+        :"0" (src), "1" (tmp), "3" (count)
+        :"memory");
+  }
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n"
        "1:\tdecl %2\n\t"
        "js 2f\n\t"
        "lodsb\n\t"
@@ -40,14 +84,33 @@
        "stosb\n"
        "2:"
        : /* no output */
        :"S" (src),"D" (dest),"c" (count):"si","di","ax","cx","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strcat(char * dest,const char * src)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp = (char *)(dest-1);
+  register char dummy;
+  __asm__ __volatile__
+    ("\n1:\tincl %1\n\t"
+     "cmpb $0,(%1)\n\t"
+     "jne 1b\n"
+     "2:\tmovb (%2),%b0\n\t"
+     "incl %2\n\t"
+     "movb %b0,(%1)\n\t"
+     "incl %1\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 2b\n"
+     :"=q" (dummy), "=r" (tmp), "=r" (src)
+     :"1"  (tmp), "2"  (src)
+     :"memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n\t"
        "repne\n\t"
        "scasb\n\t"
        "decl %1\n"
@@ -56,14 +119,37 @@
        "testb %%al,%%al\n\t"
        "jne 1b"
        : /* no output */
        :"S" (src),"D" (dest),"a" (0),"c" (0xffffffff):"si","di","ax","cx");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strncat(char * dest,const char * src,size_t count)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp = (char *)(dest-1);
+  register char dummy;
+  __asm__ __volatile__
+    ("\n1:\tincl %1\n\t"
+     "cmpb $0,(%1)\n\t"
+     "jne 1b\n"
+     "2:\tdecl %3\n\t"
+     "js 3f\n\t"
+     "movb (%2),%b0\n\t"
+     "incl %2\n\t"
+     "movb %b0,(%1)\n\t"
+     "incl %1\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 2b\n"
+     "3:\txorl %0,%0\n\t"
+     "movb %b0,(%1)\n\t"
+     :"=q" (dummy), "=r" (tmp), "=r" (src), "=r" (count)
+     :"1"  (tmp), "2"  (src), "3"  (count)
+     :"memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n\t"
        "repne\n\t"
        "scasb\n\t"
        "decl %1\n\t"
@@ -78,14 +164,36 @@
        "stosb"
        : /* no output */
        :"S" (src),"D" (dest),"a" (0),"c" (0xffffffff),"g" (count)
        :"si","di","ax","cx","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline int strcmp(const char * cs,const char * ct)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register int __res;
+  __asm__ __volatile__
+    ("\n1:\tmovb (%1),%b0\n\t"
+     "incl %1\n\t"
+     "cmpb %b0,(%2)\n\t"
+     "jne 2f\n\t"
+     "incl %2\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 1b\n\t"
+     "xorl %0,%0\n\t"
+     "jmp 3f\n"
+     "2:\tmovl $1,%0\n\t"
+     "jb 3f\n\t"
+     "negl %0\n"
+     "3:"
+     :"=q" (__res), "=r" (cs), "=r" (ct)
+     :"1" (cs), "2" (ct)
+     : "memory" );
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register int __res;
 __asm__ __volatile__(
        "cld\n"
        "1:\tlodsb\n\t"
        "scasb\n\t"
@@ -97,14 +205,37 @@
        "2:\tsbbl %%eax,%%eax\n\t"
        "orb $1,%%eax\n"
        "3:"
        :"=a" (__res):"S" (cs),"D" (ct):"si","di");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline int strncmp(const char * cs,const char * ct,size_t count)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register int __res;
+  __asm__ __volatile__
+    ("\n1:\tdecl %3\n\t"
+     "js 2f\n\t"
+     "movb (%1),%b0\n\t"
+     "incl %1\n\t"
+     "cmpb %b0,(%2)\n\t"
+     "jne 3f\n\t"
+     "incl %2\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 1b\n"
+     "2:\txorl %0,%0\n\t"
+     "jmp 4f\n"
+     "3:\tmovl $1,%0\n\t"
+     "jb 4f\n\t"
+     "negl %0\n"
+     "4:"
+     :"=q" (__res), "=r" (cs), "=r" (ct), "=r" (count)
+     :"1"  (cs), "2"  (ct),  "3" (count));
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register int __res;
 __asm__ __volatile__(
        "cld\n"
        "1:\tdecl %3\n\t"
        "js 2f\n\t"
@@ -118,14 +249,31 @@
        "3:\tsbbl %%eax,%%eax\n\t"
        "orb $1,%%al\n"
        "4:"
        :"=a" (__res):"S" (cs),"D" (ct),"c" (count):"si","di","cx");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strchr(const char * s, int c)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char * __res;
+  __asm__ __volatile__
+    ("movb %%al,%%ah\n"
+     "1:\tmovb (%1),%%al\n\t"
+     "cmpb %%ah,%%al\n\t"
+     "je 2f\n\t"
+     "incl %1\n\t"
+     "testb %%al,%%al\n\t"
+     "jne 1b\n\t"
+     "xorl %1,%1\n"
+     "2:\tmovl %1,%0\n\t"
+     :"=a" (__res), "=r" (s)
+     :"0" (c),      "1"  (s));
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register char * __res;
 __asm__ __volatile__(
        "cld\n\t"
        "movb %%al,%%ah\n"
        "1:\tlodsb\n\t"
@@ -136,14 +284,30 @@
        "movl $1,%1\n"
        "2:\tmovl %1,%0\n\t"
        "decl %0"
        :"=a" (__res):"S" (s),"0" (c):"si");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strrchr(const char * s, int c)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char * __res;
+  __asm__ __volatile__
+    ("movb %b2,%h2\n"
+     "1:\tmovb (%1),%b2\n\t"
+     "cmpb %h2,%b2\n\t"
+     "jne 2f\n\t"
+     "movl %1,%0\n"
+     "2:\tincl %1\n\t"
+     "testb %b2,%h2\n\t"
+     "jne 1b"
+     :"=r" (__res), "=r" (s), "=q" (c)
+     :"0" (0), "1"  (s), "2" (c));
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register char * __res;
 __asm__ __volatile__(
        "cld\n\t"
        "movb %%al,%%ah\n"
        "1:\tlodsb\n\t"
@@ -152,10 +316,11 @@
        "leal -1(%%esi),%0\n"
        "2:\ttestb %%al,%%al\n\t"
        "jne 1b"
        :"=d" (__res):"0" (0),"S" (s),"a" (c):"ax","si");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline size_t strspn(const char * cs, const char * ct)
 {
 register char * __res;
@@ -262,19 +427,33 @@
 return __res;
 }

 extern inline size_t strlen(const char * s)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  /* slightly slower on a 486, but with better chances of register allocation */
+  register char dummy, *tmp= (char *)s;
+  __asm__ __volatile__("\n1:\t"
+    "movb\t(%0),%1\n\t"
+    "incl\t%0\n\t"
+    "testb\t%1,%1\n\t"
+    "jne\t1b"
+    :"=r" (tmp),"=q" (dummy)
+    :"0" (s)
+    : "memory" );
+    return (tmp-s-1);
+#else /* CONFIG_M486 || CONFIG_M586 */
 register int __res;
 __asm__ __volatile__(
        "cld\n\t"
        "repne\n\t"
        "scasb\n\t"
        "notl %0\n\t"
        "decl %0"
        :"=c" (__res):"D" (s),"a" (0),"0" (0xffffffff):"di");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern char * ___strtok;

 extern inline char * strtok(char * s,const char * ct)
@@ -335,10 +514,83 @@
        :"0" (___strtok),"1" (s),"g" (ct)
        :"ax","cx","dx","di","memory");
 return __res;
 }

+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+
+#define __memcpy_c(d,s,count) /* constant, non-zero count: choose the copy loop by count modulo 4 */ \
+(((count)%4==0) ? \
+ __memcpy_by4((d),(s),(count)) : \
+ (((count)%2==0) ? \
+  __memcpy_by2((d),(s),(count)) : \
+  __memcpy_g((d),(s),(count))))
+/* memcpy: a constant count of 0 takes the generic path; the by4/by2 loops copy before testing the count */
+#define memcpy(d,s,count) \
+((__builtin_constant_p(count) && (count) != 0) ? \
+ __memcpy_c((d),(s),(count)) : \
+ __memcpy_g((d),(s),(count)))
+
+extern inline void * __memcpy_by4(void * to, const void * from, size_t n)
+{                               /* copy n bytes as n/4 dwords; returns to */
+  register void *tmp = (void *)to;
+  register int dummy1,dummy2;   /* dummy1: transfer register, dummy2: dword counter */
+  __asm__ __volatile__
+    ("\n1:\tmovl (%2),%0\n\t"   /* load one dword from the source */
+     "addl $4,%2\n\t"
+     "movl %0,(%1)\n\t"         /* store it to the destination */
+     "addl $4,%1\n\t"
+     "decl %3\n\t"              /* counter is tested only after a copy: n/4 == 0 would wrap */
+     "jnz 1b"
+     :"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2)
+     :"1" (tmp), "2" (from), "3" (n/4)
+     :"memory");
+  return (to);
+}
+
+extern inline void * __memcpy_by2(void * to, const void * from, size_t n)
+{                               /* copy n/4 dwords, then always one more 16-bit word; returns to */
+  register void *tmp = (void *)to;
+  register int dummy1,dummy2;   /* dummy1: transfer register, dummy2: counter */
+  __asm__ __volatile__
+    ("shrl $1,%3\n\t"          /* n/2 words -> n/4 dwords */
+     "jz 2f\n"                 /* only a word */
+     "1:\tmovl (%2),%0\n\t"
+     "addl $4,%2\n\t"
+     "movl %0,(%1)\n\t"
+     "addl $4,%1\n\t"
+     "decl %3\n\t"
+     "jnz 1b\n"
+     "2:\tmovw (%2),%w0\n\t"   /* tail word is copied unconditionally, so n % 4 must be 2 */
+     "movw %w0,(%1)"
+     :"=r" (dummy1), "=r" (tmp), "=r" (from), "=r" (dummy2)
+     :"1" (tmp), "2" (from), "3" (n/2)
+     :"memory");
+  return (to);
+}
+
+extern inline void * __memcpy_g(void * to, const void * from, size_t n)
+{
+  register void *tmp = (void *)to;
+  __asm__ __volatile__
+    ("cld\n\t"
+     "shrl $1,%%ecx\n\t"
+     "jnc 1f\n\t"
+     "movsb\n"
+     "1:\tshrl $1,%%ecx\n\t"
+     "jnc 2f\n\t"
+     "movsw\n"
+     "2:\trep\n\t"
+     "movsl"
+     : /* no output */
+     :"c" (n),"D" ((long) tmp),"S" ((long) from)
+     :"cx","di","si","memory");
+  return (to);
+}
+
+#else /* CONFIG_M486 || CONFIG_M586 */
+
 extern inline void * __memcpy(void * to, const void * from, size_t n)
 {
 __asm__ __volatile__(
        "cld\n\t"
        "movl %%edx, %%ecx\n\t"
@@ -400,12 +652,34 @@
 #define memcpy(t, f, n) \
 (__builtin_constant_p(n) ? \
  __constant_memcpy((t),(f),(n)) : \
  __memcpy((t),(f),(n)))

+#endif /* CONFIG_M486 || CONFIG_M586 */
+
 extern inline void * memmove(void * dest,const void * src, size_t n)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register void *tmp = (void *)dest;
+  if (dest<src)
+    __asm__ __volatile__
+      ("cld\n\t"
+       "rep\n\t"
+       "movsb"
+       : /* no output */
+       :"c" (n),"S" (src),"D" (tmp)
+       :"cx","si","di");
+  else
+    __asm__ __volatile__
+       ("std\n\t"
+        "rep\n\t"
+        "movsb\n\t"
+        : /* no output */
+        :"c" (n), "S" (n-1+(const char *)src), "D" (n-1+(char *)tmp)
+        :"cx","si","di","memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 if (dest<src)
 __asm__ __volatile__(
        "cld\n\t"
        "rep\n\t"
        "movsb"
@@ -422,10 +696,11 @@
        :"c" (n),
         "S" (n-1+(const char *)src),
         "D" (n-1+(char *)dest)
        :"cx","si","di","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline int memcmp(const void * cs,const void * ct,size_t count)
 {
 register int __res;
@@ -457,10 +732,147 @@
        :"=D" (__res):"a" (c),"D" (cs),"c" (count)
        :"cx");
 return __res;
 }

+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+
+#define __memset_cc(s,c,count) /* constant fill byte, constant non-zero count */ \
+(((count)%4==0) ? \
+ __memset_cc_by4((s),(c),(count)) : \
+ (((count)%2==0) ? \
+  __memset_cc_by2((s),(c),(count)) : \
+  __memset_cg((s),(c),(count))))
+
+#define __memset_gc(s,c,count) /* variable fill byte, constant non-zero count */ \
+(((count)%4==0) ? \
+ __memset_gc_by4((s),(c),(count)) : \
+ (((count)%2==0) ? \
+  __memset_gc_by2((s),(c),(count)) : \
+  __memset_gg((s),(c),(count))))
+/* memset: a constant count of 0 takes the rep-stosw paths; the by4/by2 loops store before testing the count */
+#define memset(s,c,count) \
+(__builtin_constant_p(c) ? \
+ ((__builtin_constant_p(count) && (count) != 0) ? \
+  __memset_cc((s),(c),(count)) : \
+  __memset_cg((s),(c),(count))) : \
+ ((__builtin_constant_p(count) && (count) != 0) ? \
+  __memset_gc((s),(c),(count)) : \
+  __memset_gg((s),(c),(count))))
+
+extern inline void * __memset_cc_by4(void * s, char c, size_t count)
+{
+/*
+ *register char *tmp = s;
+ */
+  register void *tmp = (void *)s;
+  register int  dummy;
+  __asm__ __volatile__
+    ("\n1:\tmovl %2,(%0)\n\t"
+     "addl $4,%0\n\t"
+     "decl %1\n\t"
+     "jnz 1b"
+     :"=r" (tmp), "=r" (dummy)
+     :"q" (0x01010101UL * (unsigned char) c), "0" (tmp), "1" (count/4)
+     :"memory");
+  return s;
+}
+
+extern inline void * __memset_cc_by2(void * s, char c, size_t count)
+{                               /* store count/4 dwords, then always one more 16-bit word; returns s */
+  register void *tmp = (void *)s;
+  register int  dummy;          /* loop counter */
+  __asm__ __volatile__
+    ("shrl $1,%1\n\t"          /* count/2 words -> count/4 whole dwords (may be zero) */
+     "jz 2f\n"
+     "\n1:\tmovl %2,(%0)\n\t"
+     "addl $4,%0\n\t"
+     "decl %1\n\t"
+     "jnz 1b\n"
+     "2:\tmovw %w2,(%0)"        /* tail word is stored unconditionally, so count % 4 must be 2 */
+     :"=r" (tmp), "=r" (dummy)
+     :"q" (0x01010101UL * (unsigned char) c), "0" (tmp), "1" (count/2)
+     :"memory");
+  return s;
+}
+
+extern inline void * __memset_gc_by4(void * s, char c, size_t count)
+{                               /* fill count bytes (non-zero multiple of 4) with c; returns s */
+  register void *tmp = (void *)s;
+  register int dummy1,dummy2;   /* 32-bit outputs: fill pattern and dword counter */
+  __asm__ __volatile__
+    ("movb %b0,%h0\n"           /* replicate c into both bytes of the low word */
+     "pushw %w0\n\t"
+     "shll $16,%0\n\t"          /* move that word to the high half ... */
+     "popw %w0\n"               /* ... and restore it in the low half */
+     "1:\tmovl %0,(%1)\n\t"
+     "addl $4,%1\n\t"
+     "decl %2\n\t"
+     "jnz 1b\n"
+     :"=q" (dummy1), "=r" (tmp), "=r" (dummy2)   /* operand 0 must be int-sized, as in __memset_gc_by2 */
+     :"0" ((unsigned) c),  "1"  (tmp), "2" (count/4)
+     :"memory");
+  return s;
+}
+
+extern inline void * __memset_gc_by2(void * s, char c, size_t count)
+{
+  register void *tmp = (void *)s;
+  register int dummy1,dummy2;
+  __asm__ __volatile__
+    ("movb %b0,%h0\n\t"
+     "shrl $1,%2\n\t"          /* may be divisible also by 4 */
+     "jz 2f\n\t"
+     "pushw %w0\n\t"
+     "shll $16,%0\n\t"
+     "popw %w0\n"
+     "1:\tmovl %0,(%1)\n\t"
+     "addl $4,%1\n\t"
+     "decl %2\n\t"
+     "jnz 1b\n"
+     "2:\tmovw %w0,(%1)"
+     :"=q" (dummy1), "=r" (tmp), "=r" (dummy2)
+     :"0" ((unsigned) c),  "1"  (tmp), "2" (count/2)
+     :"memory");
+  return s;
+}
+
+extern inline void * __memset_cg(void * s, char c, size_t count)
+{
+  register void *tmp = (void *)s;
+  __asm__ __volatile__
+    ("shrl $1,%%ecx\n\t"
+     "rep\n\t"
+     "stosw\n\t"
+     "jnc 1f\n\t"
+     "movb %%al,(%%edi)\n"
+     "1:"
+     : /* no output */
+     :"c" (count),"D" (tmp), "a" (0x0101U * (unsigned char) c)
+     :"cx","di","memory");
+  return s;
+}
+
+extern inline void * __memset_gg(void * s,char c,size_t count)
+{
+  register void *tmp = (void *)s;
+  __asm__ __volatile__
+    ("movb %%al,%%ah\n\t"
+     "shrl $1,%%ecx\n\t"
+     "rep\n\t"
+     "stosw\n\t"
+     "jnc 1f\n\t"
+     "movb %%al,(%%edi)\n"
+     "1:"
+     : /* no output */
+     :"c" (count),"D" (tmp), "a" (c)
+     :"cx","di","memory");
+  return s;
+}
+
+#else /* CONFIG_M486 || CONFIG_M586 */
+
 extern inline void * __memset_generic(void * s, char c,size_t count)
 {
 __asm__ __volatile__(
        "cld\n\t"
        "rep\n\t"
@@ -527,10 +939,12 @@

 #define memset(s, c, count) \
 (__builtin_constant_p(c) ? \
  __constant_c_x_memset((s),(0x01010101UL*(unsigned char)c),(count)) : \
  __memset((s),(c),(count)))
+
+#endif /* CONFIG_M486 || CONFIG_M586 */

 /*
  * find the first occurrence of byte 'c', or 1 past the area if none
  */
 extern inline void * memscan(void * addr, int c, size_t size)
diff -u5 linux/include/asm-i386/byteorder.h.oldd linux/include/asm-i386/byteorder.h
--- linux/include/asm-i386/byteorder.h.oldd     Tue May  2 18:54:54 1995
+++ linux/include/asm-i386/byteorder.h  Tue May  2 19:23:23 1995
@@ -25,13 +25,18 @@
 extern unsigned short int      __constant_ntohs(unsigned short int);

 extern __inline__ unsigned long int
 __ntohl(unsigned long int x)
 {
+#if (defined(CONFIG_M486) || defined(CONFIG_M586)) && \
+     defined(CONFIG_ASM_OPTIMIZE)
+       __asm__("bswap %0"
+#else /* (CONFIG_M486 || CONFIG_M586) && CONFIG_ASM_OPTIMIZE */
        __asm__("xchgb %b0,%h0\n\t"   /* swap lower bytes     */
                "rorl $16,%0\n\t"     /* swap words           */
                "xchgb %b0,%h0"               /* swap higher bytes    */
+#endif /* (CONFIG_M486 || CONFIG_M586) && CONFIG_ASM_OPTIMIZE */
                :"=q" (x)
                : "0" (x));
        return x;
 }

--
 Werner Fink            Institut f"ur Theoretische und Angewandte Physik
                        Universit"at Stuttgart
                        Pfaffenwaldring 57/VI,  70550 Stuttgart, Germany
 E-mail: Werner.F...@itap.physik.uni-stuttgart.de