Kernel speed and inlined string.h routines

Kernel speed and inlined string.h routines

Post by Werner Fi » Wed, 03 May 1995 04:00:00



I've found in:

    sunsite.unc.edu/pub/Linux/Kernel/patches/misc/lx10inline_v2.tgz

inlined assembler for library functions (memcpy etc.) for Linux on
i486 CPUs.  Including the functions (made for kernel version 1.0.9)
in kernel 1.2.5 really increases the kernel speed on my machine.

Note: I'm _NOT_ the author of the assembler functions. For more
information please read the README from the authors Alberto Vignani and
Davide Parodi in the original source lx10inline_v2.tgz!

The enclosed patch modifies the files
  linux/arch/i386/config.in
  linux/lib/string.c
  linux/include/asm-i386/string.h
against 1.2.6+.

After patching, run `make config'
with both the 486-specific optimizations _and_ the assembler-optimized
library enabled.

Werner

PS: Todo: Include more functions like bcopy;
          Optimize the C-code part of memcpy and memset
          in linux/include/asm-i386/string.h better
          (like the i386 versions written by Linus)
------------------------------------------------------------------------
diff -u5 linux/arch/i386/config.in.oldd linux/arch/i386/config.in
--- linux/arch/i386/config.in.oldd      Sun Apr 30 21:25:11 1995
+++ linux/arch/i386/config.in   Sun Apr 30 21:23:29 1995
@@ -33,10 +33,14 @@
 #bool 'Use -mpentium flag for Pentium-specific optimizations' CONFIG_M586 n
 #if [ "$CONFIG_M586" = "n" ]; then
 bool 'Use -m486 flag for 486-specific optimizations' CONFIG_M486 y
 #fi

+comment 'Using of assembler optimized library (experimental)'
+comment 'combined with 486-specific optimizations only on 486 or 586'
+bool 'Assembler optimized library' CONFIG_ASM_STRING_H y
+
 comment 'Loadable module support'
 bool 'Set version information on all symbols for modules' CONFIG_MODVERSIONS n

 if [ "$CONFIG_NET" = "y" ]; then
 comment 'Networking options'
diff -u5 linux/lib/string.c.oldd linux/lib/string.c
--- linux/lib/string.c.oldd     Wed Apr 19 11:13:35 1995
+++ linux/lib/string.c  Wed Apr 19 11:54:19 1995
@@ -9,10 +9,18 @@
  * as inline code in <asm-xx/string.h>
  *
  * These are buggy as well..
  */

+#if defined(CONFIG_ASM_STRING_H)
+
+#include <linux/string.h>
+
+char * ___strtok = NULL;
+
+#else /* CONFIG_ASM_STRING_H */
+
 #include <linux/types.h>

 char * ___strtok = NULL;

 char * strcpy(char * dest,const char *src)
@@ -239,5 +247,7 @@
                p++;
                size--;
        }
        return (void *) p;
 }
+
+#endif /* CONFIG_ASM_STRING_H */
diff -u5 linux/include/asm-i386/string.h.oldd linux/include/asm-i386/string.h
--- linux/include/asm-i386/string.h.oldd        Mon Jan  9 05:33:23 1995
+++ linux/include/asm-i386/string.h     Tue May  2 12:02:21 1995
@@ -8,28 +8,72 @@
  * see especially strtok,strstr,str[c]spn. They should work, but are not
  * very easy to understand. Everything is done entirely within the register
  * set, making the functions fast and clean. String instructions have been
  * used through-out, making for "slightly" unclear code :-)
  *
- *             Copyright (C) 1991, 1992 Linus Torvalds
+ *             Copyright (C) 1991,1992,1993,1994 Linus Torvalds
+ *             Revised and optimized for i486/pentium
+ *             1994/03/15 by Alberto Vignani/Davide Parodi @crf.it
  */

 extern inline char * strcpy(char * dest,const char *src)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp= (char *)dest;
+  register char dummy;
+  __asm__ __volatile__
+    ("\n1:\t"
+     "movb (%0),%2\n\t"
+     "incl %0\n\t"
+     "movb %2,(%1)\n\t"
+     "incl %1\n\t"
+     "testb %2,%2\n\t"
+     "jne 1b"
+     :"=r" (src), "=r" (tmp), "=q" (dummy)
+     :"0" (src), "1" (tmp)
+     :"memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n"
        "1:\tlodsb\n\t"
        "stosb\n\t"
        "testb %%al,%%al\n\t"
        "jne 1b"
        : /* no output */
        :"S" (src),"D" (dest):"si","di","ax","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strncpy(char * dest,const char *src,size_t count)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp= (char *)dest;
+  register char dummy;
+  if (count) {
+  __asm__ __volatile__ (
+        "\n1:\t"
+        "movb (%0),%2\n\t"
+        "incl %0\n\t"
+        "movb %2,(%1)\n\t"
+        "incl %1\n\t"
+        "decl %3\n\t"
+        "je 3f\n\t"
+        "testb %2,%2\n\t"
+        "jne 1b\n\t"
+        "2:\tmovb %2,(%1)\n\t"
+        "incl %1\n\t"
+        "decl %3\n\t"
+        "jne 2b\n\t"
+        "3:"
+        :"=r" (src), "=r" (tmp), "=q" (dummy), "=r" (count)
+        :"0" (src), "1" (tmp), "3" (count)
+        :"memory");
+  }
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n"
        "1:\tdecl %2\n\t"
        "js 2f\n\t"
        "lodsb\n\t"
@@ -40,14 +84,33 @@
        "stosb\n"
        "2:"
        : /* no output */
        :"S" (src),"D" (dest),"c" (count):"si","di","ax","cx","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strcat(char * dest,const char * src)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp = (char *)(dest-1);
+  register char dummy;
+  __asm__ __volatile__
+    ("\n1:\tincl %1\n\t"
+     "cmpb $0,(%1)\n\t"
+     "jne 1b\n"
+     "2:\tmovb (%2),%b0\n\t"
+     "incl %2\n\t"
+     "movb %b0,(%1)\n\t"
+     "incl %1\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 2b\n"
+     :"=q" (dummy), "=r" (tmp), "=r" (src)
+     :"1"  (tmp), "2"  (src)
+     :"memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n\t"
        "repne\n\t"
        "scasb\n\t"
        "decl %1\n"
@@ -56,14 +119,37 @@
        "testb %%al,%%al\n\t"
        "jne 1b"
        : /* no output */
        :"S" (src),"D" (dest),"a" (0),"c" (0xffffffff):"si","di","ax","cx");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strncat(char * dest,const char * src,size_t count)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char *tmp = (char *)(dest-1);
+  register char dummy;
+  __asm__ __volatile__
+    ("\n1:\tincl %1\n\t"
+     "cmpb $0,(%1)\n\t"
+     "jne 1b\n"
+     "2:\tdecl %3\n\t"
+     "js 3f\n\t"
+     "movb (%2),%b0\n\t"
+     "incl %2\n\t"
+     "movb %b0,(%1)\n\t"
+     "incl %1\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 2b\n"
+     "3:\txorl %0,%0\n\t"
+     "movb %b0,(%1)\n\t"
+     :"=q" (dummy), "=r" (tmp), "=r" (src), "=r" (count)
+     :"1"  (tmp), "2"  (src), "3"  (count)
+     :"memory");
+  return dest;
+#else /* CONFIG_M486 || CONFIG_M586 */
 __asm__ __volatile__(
        "cld\n\t"
        "repne\n\t"
        "scasb\n\t"
        "decl %1\n\t"
@@ -78,14 +164,36 @@
        "stosb"
        : /* no output */
        :"S" (src),"D" (dest),"a" (0),"c" (0xffffffff),"g" (count)
        :"si","di","ax","cx","memory");
 return dest;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline int strcmp(const char * cs,const char * ct)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register int __res;
+  __asm__ __volatile__
+    ("\n1:\tmovb (%1),%b0\n\t"
+     "incl %1\n\t"
+     "cmpb %b0,(%2)\n\t"
+     "jne 2f\n\t"
+     "incl %2\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 1b\n\t"
+     "xorl %0,%0\n\t"
+     "jmp 3f\n"
+     "2:\tmovl $1,%0\n\t"
+     "jb 3f\n\t"
+     "negl %0\n"
+     "3:"
+     :"=q" (__res), "=r" (cs), "=r" (ct)
+     :"1" (cs), "2" (ct)
+     : "memory" );
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register int __res;
 __asm__ __volatile__(
        "cld\n"
        "1:\tlodsb\n\t"
        "scasb\n\t"
@@ -97,14 +205,37 @@
        "2:\tsbbl %%eax,%%eax\n\t"
        "orb $1,%%eax\n"
        "3:"
        :"=a" (__res):"S" (cs),"D" (ct):"si","di");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline int strncmp(const char * cs,const char * ct,size_t count)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register int __res;
+  __asm__ __volatile__
+    ("\n1:\tdecl %3\n\t"
+     "js 2f\n\t"
+     "movb (%1),%b0\n\t"
+     "incl %1\n\t"
+     "cmpb %b0,(%2)\n\t"
+     "jne 3f\n\t"
+     "incl %2\n\t"
+     "testb %b0,%b0\n\t"
+     "jne 1b\n"
+     "2:\txorl %0,%0\n\t"
+     "jmp 4f\n"
+     "3:\tmovl $1,%0\n\t"
+     "jb 4f\n\t"
+     "negl %0\n"
+     "4:"
+     :"=q" (__res), "=r" (cs), "=r" (ct), "=r" (count)
+     :"1"  (cs), "2"  (ct),  "3" (count));
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register int __res;
 __asm__ __volatile__(
        "cld\n"
        "1:\tdecl %3\n\t"
        "js 2f\n\t"
@@ -118,14 +249,31 @@
        "3:\tsbbl %%eax,%%eax\n\t"
        "orb $1,%%al\n"
        "4:"
        :"=a" (__res):"S" (cs),"D" (ct),"c" (count):"si","di","cx");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strchr(const char * s, int c)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char * __res;
+  __asm__ __volatile__
+    ("movb %%al,%%ah\n"
+     "1:\tmovb (%1),%%al\n\t"
+     "cmpb %%ah,%%al\n\t"
+     "je 2f\n\t"
+     "incl %1\n\t"
+     "testb %%al,%%al\n\t"
+     "jne 1b\n\t"
+     "xorl %1,%1\n"
+     "2:\tmovl %1,%0\n\t"
+     :"=a" (__res), "=r" (s)
+     :"0" (c),      "1"  (s));
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register char * __res;
 __asm__ __volatile__(
        "cld\n\t"
        "movb %%al,%%ah\n"
        "1:\tlodsb\n\t"
@@ -136,14 +284,30 @@
        "movl $1,%1\n"
        "2:\tmovl %1,%0\n\t"
        "decl %0"
        :"=a" (__res):"S" (s),"0" (c):"si");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline char * strrchr(const char * s, int c)
 {
+#if defined(CONFIG_M486) || defined(CONFIG_M586)
+  register char * __res;
+  __asm__ __volatile__
+    ("movb %b2,%h2\n"
+     "1:\tmovb (%1),%b2\n\t"
+     "cmpb %h2,%b2\n\t"
+     "jne 2f\n\t"
+     "movl %1,%0\n"
+     "2:\tincl %1\n\t"
+     "testb %b2,%h2\n\t"
+     "jne 1b"
+     :"=r" (__res), "=r" (s), "=q" (c)
+     :"0" (0), "1"  (s), "2" (c));
+  return __res;
+#else /* CONFIG_M486 || CONFIG_M586 */
 register char * __res;
 __asm__ __volatile__(
        "cld\n\t"
        "movb %%al,%%ah\n"
        "1:\tlodsb\n\t"
@@ -152,10 +316,11 @@
        "leal -1(%%esi),%0\n"
        "2:\ttestb %%al,%%al\n\t"
        "jne 1b"
        :"=d" (__res):"0" (0),"S" (s),"a" (c):"ax","si");
 return __res;
+#endif /* CONFIG_M486 || CONFIG_M586 */
 }

 extern inline size_t strspn(const char *
...

read more »

 
 
 

Kernel speed and inlined string.h routines

Post by Mr S J Greenaw » Thu, 04 May 1995 04:00:00


: I'm quite pleased to know that my patches still work with 1.2.x !!
: Actually I discarded them long time ago, maybe when Linus reorganized
: the directory structure.
: Are you sure the speed increase is really significant? I measured
: only a few percent difference, and Linus was *not* impressed enough.
: Do you have a Pentium or a 486? And which one?
: I'm now considering reviving those patches if they have still some
: value for 1.2.x.

I thought gcc inlined these sort of functions anyway?

Simon Greenaway

 
 
 

Kernel speed and inlined string.h routines

Post by Werner Fi » Thu, 04 May 1995 04:00:00




> : I'm quite pleased to know that my patches still work with 1.2.x !!
> : Actually I discarded them long time ago, maybe when Linus reorganized
> : the directory structure.
> : Are you sure the speed increase is really significant? I measured
> : only a few percent difference, and Linus was *not* impressed enough.
> : Do you have a Pentium or a 486? And which one?
> : I'm now considering reviving those patches if they have still some
> : value for 1.2.x.

> I thought gcc inlined these sort of functions anyway?

> Simon Greenaway

No, not automatically. These library functions are
_defined_ in linux/include/asm-i386/string.h as inlined.
It is just that these functions were not used in 1.2.x; have
a look at linux/lib/string.c!

  Werner
--
 Werner Fink            Institut f"ur Theoretische und Angewandte Physik
                        Universit"at Stuttgart
                        Pfaffenwaldring 57/VI,  70550 Stuttgart, Germany

 
 
 

Kernel speed and inlined string.h routines

Post by Erik Bla » Wed, 10 May 1995 04:00:00


: Are you sure the speed increase is really significant? I measured
I am measuring a performance loss with the old inline-patches and 1.2.8.
Here are the results with the bench4 test:

Without inline:

FILE WRITE (4MB)
    0.030u 3.360s 0:10.95 30.9% 0+0k 0+0io 31pf+0w
    0.020u 3.200s 0:12.27 26.2% 0+0k 0+0io 31pf+0w
    0.040u 3.090s 0:12.26 25.5% 0+0k 0+0io 31pf+0w
    0.020u 3.320s 0:12.29 27.1% 0+0k 0+0io 31pf+0w
FILE READ (4MB)
    0.020u 1.250s 0:01.27 100.0% 0+0k 0+0io 26pf+0w
    0.030u 1.170s 0:01.19 100.8% 0+0k 0+0io 26pf+0w
    0.000u 1.200s 0:01.20 100.0% 0+0k 0+0io 26pf+0w
    0.070u 1.130s 0:01.20 100.0% 0+0k 0+0io 26pf+0w
PIPE:           0.330u 2.230s 0:08.41 30.4% 0+0k 0+0io 26pf+0w
PIPE:           0.320u 2.300s 0:08.42 31.1% 0+0k 0+0io 26pf+0w
PIPE:           0.360u 2.240s 0:08.68 29.9% 0+0k 0+0io 26pf+0w
PIPE:           0.330u 2.250s 0:08.61 29.9% 0+0k 0+0io 26pf+0w
SWITCH2:        20000 full duplex iterations in 2845 mS (7029/sec)
SWITCH2:        20000 full duplex iterations in 3025 mS (6611/sec)
SWITCH2:        20000 full duplex iterations in 2868 mS (6973/sec)
SWITCH2:        20000 full duplex iterations in 2877 mS (6951/sec)
SWITCH3:        20000 full duplex iterations in 6013 mS (3326/sec)
SWITCH3:        20000 full duplex iterations in 5973 mS (3348/sec)
SWITCH3:        20000 full duplex iterations in 5929 mS (3373/sec)
SWITCH3:        20000 full duplex iterations in 6015 mS (3325/sec)
SWITCH3p:       20000 full duplex iterations in 5365 mS (3727/sec)
SWITCH3p:       20000 full duplex iterations in 5281 mS (3787/sec)
SWITCH3p:       20000 full duplex iterations in 5363 mS (3729/sec)
SWITCH3p:       20000 full duplex iterations in 5351 mS (3737/sec)

And with inline:

FILE WRITE (4MB)
    0.030u 2.540s 0:11.31 22.7% 0+0k 0+0io 31pf+0w
    0.030u 2.700s 0:12.15 22.4% 0+0k 0+0io 31pf+0w
    0.050u 2.610s 0:12.34 21.5% 0+0k 0+0io 31pf+0w
    0.050u 2.620s 0:12.30 21.7% 0+0k 0+0io 31pf+0w
FILE READ (4MB)
    0.000u 1.250s 0:01.25 100.0% 0+0k 0+0io 26pf+0w
    0.010u 1.190s 0:01.19 100.8% 0+0k 0+0io 26pf+0w
    0.020u 1.190s 0:01.20 100.8% 0+0k 0+0io 26pf+0w
    0.020u 1.180s 0:01.20 100.0% 0+0k 0+0io 26pf+0w
PIPE:           0.430u 2.370s 0:09.34 29.9% 0+0k 0+0io 26pf+0w
PIPE:           0.400u 2.170s 0:09.05 28.3% 0+0k 0+0io 26pf+0w
PIPE:           0.410u 2.270s 0:09.43 28.4% 0+0k 0+0io 26pf+0w
PIPE:           0.470u 2.380s 0:09.40 30.3% 0+0k 0+0io 26pf+0w
SWITCH2:        20000 full duplex iterations in 3146 mS (6357/sec)
SWITCH2:        20000 full duplex iterations in 3135 mS (6379/sec)
SWITCH2:        20000 full duplex iterations in 3090 mS (6472/sec)
SWITCH2:        20000 full duplex iterations in 3267 mS (6121/sec)
SWITCH3:        20000 full duplex iterations in 6390 mS (3129/sec)
SWITCH3:        20000 full duplex iterations in 6303 mS (3173/sec)
SWITCH3:        20000 full duplex iterations in 6226 mS (3212/sec)
SWITCH3:        20000 full duplex iterations in 6317 mS (3166/sec)
SWITCH3p:       20000 full duplex iterations in 5475 mS (3652/sec)
SWITCH3p:       20000 full duplex iterations in 5551 mS (3602/sec)
SWITCH3p:       20000 full duplex iterations in 5565 mS (3593/sec)
SWITCH3p:       20000 full duplex iterations in 5506 mS (3632/sec)

But this may be because of my bad adaptation of the old patch....
The author should post an up-to-date version for 1.2 :-)

Erik

--


Ich habe Dinge gesehen, die ihr Menschen niemals glauben wuerdet. Gigantische
Schiffe, die brannten draussen vor der Schulter des Orion.Und ich habe C-Beams  
gesehen - glitzernd im Dunkeln nahe dem Tannhaeuser Tor. All diese Momente
werden verloren sein in der Zeit...so wie Traenen im Regen.      
                                Zeit zu sterben...               [Roy Batty]