kmalloc_percpu -- 1 of 2

Post by Ravikiran G Thirumalai » Thu, 05 Dec 2002 14:20:05



Here's a 2.5.50 version of kmalloc_percpu, originally submitted by Dipankar.
This one incorporates Rusty's suggestions to rearrange code and sync
up with its static counterpart. This version needs malloc_sizes exposed
in order to move the interfaces from slab.c to percpu.c (kmalloc_percpu and
kfree_percpu do not have anything to do with the slab allocator itself).
The first of the two patches exposes malloc_sizes; the second
is the actual allocator.  I'll follow this up with patchsets to let the
networking MIBs use kmalloc_percpu. We'll have to use kmalloc_percpu
for the MIBs since DEFINE_PER_CPU won't work with modules (and the ipv6
code can be compiled as a module).
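
For illustration, a minimal usage sketch from a module (hypothetical code;
the struct and function names are made up, only the kmalloc_percpu,
get_cpu_ptr and kfree_percpu interfaces from the patches below are real):

struct mib_stats {
        unsigned long in_pkts;
        unsigned long out_pkts;
};

static struct mib_stats *mibs;  /* encoded pointer from kmalloc_percpu() */

static int __init mib_init(void)
{
        mibs = kmalloc_percpu(sizeof (struct mib_stats), GFP_KERNEL);
        if (!mibs)
                return -ENOMEM;
        return 0;
}

static void count_in_pkt(void)
{
        /* get_cpu_ptr() disables preemption until put_cpu_ptr() */
        struct mib_stats *s = get_cpu_ptr(mibs);
        s->in_pkts++;
        put_cpu_ptr(mibs);
}

static void __exit mib_exit(void)
{
        kfree_percpu(mibs);
}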

Following is patch 1 of 2.

D: Name: slabchange-2.5.50-1.patch
D: Description: Exposes malloc_sizes for kmalloc_percpu
D: Author: Ravikiran Thirumalai

 include/linux/slab.h |   13 +++++++++++++
 mm/slab.c            |    9 +--------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff -ruN linux-2.5.50/include/linux/slab.h kmalloc_percpu-2.5.50/include/linux/slab.h
--- linux-2.5.50/include/linux/slab.h   Thu Nov 28 04:06:23 2002

 extern kmem_cache_t    *sigact_cachep;
 extern kmem_cache_t    *bio_cachep;

+/*
+ * Size description struct for general caches.
+ * This had to be exposed for kmalloc_percpu.
+ */
+
+struct cache_sizes {
+       size_t           cs_size;
+       kmem_cache_t    *cs_cachep;
+       kmem_cache_t    *cs_dmacachep;
+};
+
+extern struct cache_sizes malloc_sizes[];
+
 #endif /* __KERNEL__ */

 #endif /* _LINUX_SLAB_H */
diff -ruN linux-2.5.50/mm/slab.c kmalloc_percpu-2.5.50/mm/slab.c
--- linux-2.5.50/mm/slab.c      Thu Nov 28 04:06:17 2002

 #define        SET_PAGE_SLAB(pg,x)   ((pg)->list.prev = (struct list_head *)(x))
 #define        GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->list.prev)

-/* Size description struct for general caches. */
-struct cache_sizes {
-       size_t           cs_size;
-       kmem_cache_t    *cs_cachep;
-       kmem_cache_t    *cs_dmacachep;
-};
-
 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
-static struct cache_sizes malloc_sizes[] = {
+struct cache_sizes malloc_sizes[] = {
 #if PAGE_SIZE == 4096
        {    32,        NULL, NULL},
 #endif
-

kmalloc_percpu -- 1 of 2

Post by Ravikiran G Thirumalai » Thu, 05 Dec 2002 14:20:08


Here's a 2.5.50 version of kmalloc_percpu, originally submitted by Dipankar.
This one incorporates Rusty's suggestions to rearrange code and sync
up with its static counterpart. This version needs malloc_sizes exposed
in order to move the interfaces from slab.c to percpu.c (kmalloc_percpu and
kfree_percpu do not have anything to do with the slab allocator itself).
The first of the two patches exposes malloc_sizes; the second
is the actual allocator.  I'll follow this up with patchsets to let the
networking MIBs use kmalloc_percpu. We'll have to use kmalloc_percpu
for the MIBs since DEFINE_PER_CPU won't work with modules (and the ipv6
code can be compiled as a module).

Following is patch 2 of 2.

D: Name: kmalloc_percpu-2.5.50-1.patch
D: Description: Dynamic per-cpu kernel memory allocator
D: Author: Dipankar Sarma & Ravikiran Thirumalai

 include/linux/percpu.h |   39 ++++
 init/main.c            |    1
 kernel/Makefile        |    4
 kernel/ksyms.c         |    4
 kernel/percpu.c        |  452 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 498 insertions(+), 2 deletions(-)

diff -ruN linux-2.5.50/include/linux/percpu.h kmalloc_percpu-2.5.50/include/linux/percpu.h
--- linux-2.5.50/include/linux/percpu.h Thu Nov 28 04:06:19 2002
+++ kmalloc_percpu-2.5.50/include/linux/percpu.h        Sun Dec  1 11:54:49 2002
@@ -1,10 +1,49 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
 #include <linux/spinlock.h> /* For preempt_disable() */
+#include <linux/slab.h> /* For kmalloc_percpu() */
 #include <asm/percpu.h>

 /* Must be an lvalue. */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()

+#ifdef CONFIG_SMP
+
+struct percpu_data {
+       void *ptrs[NR_CPUS];
+       void *blkp;
+};
+
+/*
+ * Use this with kmalloc_percpu.  The pointer handed out by
+ * kmalloc_percpu() is the bitwise complement of the real
+ * struct percpu_data address, so dereferences without this
+ * wrapper fault noisily.
+ */
+#define per_cpu_ptr(ptr, cpu)                   \
+({                                              \
+        struct percpu_data *__p = (struct percpu_data *)~(unsigned long)(ptr); \
+        (__typeof__(ptr))__p->ptrs[(cpu)];  \
+})
+
+extern void *kmalloc_percpu(size_t size, int flags);
+extern void kfree_percpu(const void *);
+extern void kmalloc_percpu_init(void);
+
+#else /* CONFIG_SMP */
+
+#define per_cpu_ptr(ptr, cpu) (ptr)
+
+static inline void *kmalloc_percpu(size_t size, int flags)
+{
+       return(kmalloc(size, flags));
+}
+static inline void kfree_percpu(const void *ptr)
+{      
+       kfree(ptr);
+}
+static inline void kmalloc_percpu_init(void) { }
+
+#endif /* CONFIG_SMP */
+
+/* Use these with kmalloc_percpu */
+#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
+#define put_cpu_ptr(ptr) put_cpu()
+
 #endif /* __LINUX_PERCPU_H */
diff -ruN linux-2.5.50/init/main.c kmalloc_percpu-2.5.50/init/main.c
--- linux-2.5.50/init/main.c    Thu Nov 28 04:05:51 2002
+++ kmalloc_percpu-2.5.50/init/main.c   Sun Dec  1 11:54:49 2002
@@ -423,6 +423,7 @@
        page_address_init();
        mem_init();
        kmem_cache_sizes_init();
+       kmalloc_percpu_init();
        pidhash_init();
        pgtable_cache_init();
        pte_chain_init();
diff -ruN linux-2.5.50/kernel/Makefile kmalloc_percpu-2.5.50/kernel/Makefile
--- linux-2.5.50/kernel/Makefile        Thu Nov 28 04:05:51 2002
+++ kmalloc_percpu-2.5.50/kernel/Makefile       Sun Dec  1 11:54:49 2002
@@ -4,7 +4,7 @@

 export-objs = signal.o sys.o kmod.o workqueue.o ksyms.o pm.o exec_domain.o \
                printk.o platform.o suspend.o dma.o module.o cpufreq.o \
-               profile.o rcupdate.o intermodule.o
+               profile.o rcupdate.o intermodule.o percpu.o

 obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
            exit.o itimer.o time.o softirq.o resource.o \
@@ -13,7 +13,7 @@
            rcupdate.o intermodule.o extable.o

 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o
+obj-$(CONFIG_SMP) += cpu.o percpu.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += ksyms.o module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
diff -ruN linux-2.5.50/kernel/ksyms.c kmalloc_percpu-2.5.50/kernel/ksyms.c
--- linux-2.5.50/kernel/ksyms.c Thu Nov 28 04:05:47 2002
+++ kmalloc_percpu-2.5.50/kernel/ksyms.c        Sun Dec  1 11:54:49 2002
@@ -97,6 +97,10 @@
 EXPORT_SYMBOL(remove_shrinker);
 EXPORT_SYMBOL(kmalloc);
 EXPORT_SYMBOL(kfree);
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(kmalloc_percpu);
+EXPORT_SYMBOL(kfree_percpu);
+#endif
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(__vmalloc);
 EXPORT_SYMBOL(vmalloc);
diff -ruN linux-2.5.50/kernel/percpu.c kmalloc_percpu-2.5.50/kernel/percpu.c
--- linux-2.5.50/kernel/percpu.c        Thu Jan  1 05:30:00 1970
+++ kmalloc_percpu-2.5.50/kernel/percpu.c       Sun Dec  1 11:54:49 2002
@@ -0,0 +1,452 @@
+/*
+ * Dynamic Per-CPU Data Allocator.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) IBM Corporation, 2002
+ *
+ * Author:              Dipankar Sarma <dipan...@in.ibm.com>
+ *                     Ravikiran G. Thirumalai <ki...@in.ibm.com>
+ *
+ */
+
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cache.h>
+
+struct percpu_data_blklist {
+       struct list_head blks;
+       struct list_head *firstnotfull;
+       spinlock_t lock;
+       size_t objsize;
+       size_t blksize;
+       kmem_cache_t *cachep;
+       char *cachename;
+};
+
+struct percpu_data_blk {
+       struct list_head linkage;
+       void *blkaddr[NR_CPUS];
+       unsigned int usecount;
+       int *freearr;
+       int freehead;
+       struct percpu_data_blklist *blklist;
+};
+
+static struct percpu_data_blklist data_blklist[] = {
+       {
+        .blks = LIST_HEAD_INIT(data_blklist[0].blks),
+        .firstnotfull = &data_blklist[0].blks,
+        .lock = SPIN_LOCK_UNLOCKED,
+        .objsize = 4,
+        .blksize = ALIGN(4, SMP_CACHE_BYTES),
+        .cachep = NULL,
+        .cachename = "percpu_data_4"},
+       {
+        .blks = LIST_HEAD_INIT(data_blklist[1].blks),
+        .firstnotfull = &data_blklist[1].blks,
+        .lock = SPIN_LOCK_UNLOCKED,
+        .objsize = 8,
+        .blksize = ALIGN(8, SMP_CACHE_BYTES),
+        .cachep = NULL,
+        .cachename = "percpu_data_8"},
+       {
+        .blks = LIST_HEAD_INIT(data_blklist[2].blks),
+        .firstnotfull = &data_blklist[2].blks,
+        .lock = SPIN_LOCK_UNLOCKED,
+        .objsize = 16,
+        .blksize = ALIGN(16, SMP_CACHE_BYTES),
+        .cachep = NULL,
+        .cachename = "percpu_data_16"},
+#if PAGE_SIZE != 4096
+       {
+        .blks = LIST_HEAD_INIT(data_blklist[3].blks),
+        .firstnotfull = &data_blklist[3].blks,
+        .lock = SPIN_LOCK_UNLOCKED,
+        .objsize = 32,
+        .blksize = ALIGN(32, SMP_CACHE_BYTES),
+        .cachep = NULL,
+        .cachename = "percpu_data_32"}
+#endif
+};
+
+static int data_blklist_count =
+    sizeof (data_blklist) / sizeof (struct percpu_data_blklist);
+
+/*
+ * Allocate a block descriptor structure and initialize it.  
+ * Returns the address of the block descriptor or NULL on failure.
+ */
+static struct percpu_data_blk *
+percpu_data_blk_alloc(struct percpu_data_blklist *blklist, int flags)
+{
+       struct percpu_data_blk *blkp;
+       int i;
+       int count;
+
+       if (!(blkp = kmalloc(sizeof (struct percpu_data_blk), flags)))
+               goto out1;
+       INIT_LIST_HEAD(&blkp->linkage);
+       blkp->usecount = 0;
+       count = blklist->blksize / blklist->objsize;
+       blkp->freearr = kmalloc(count * sizeof (int), flags);
+       if (!blkp->freearr)
+               goto out;
+       blkp->freehead = 0;
+       for (i = 0; i < count; i++)
+               blkp->freearr[i] = i + 1;
+       blkp->freearr[i - 1] = -1;   /* Marks the end of the array */
+       blkp->blklist = blklist;
+       return blkp;
+out:
+       kfree(blkp);
+out1:
+       return NULL;
+}
+
+/*
+ * Frees the block descriptor structure
+ */
+static void
+percpu_data_blk_free(struct percpu_data_blk *blkp)
+{
+       kfree(blkp);
+}
+
+/*
+ * Add a block to the percpu data object memory pool.
+ * Returns 0 on failure and 1 on success
+ */
+static int
+percpu_data_mem_grow(struct percpu_data_blklist *blklist, int flags)
+{
+       struct percpu_data_blk *blkp;
+       unsigned long save_flags;
+       int i;
+
+       if (!(blkp = percpu_data_blk_alloc(blklist, flags)))
+               goto out;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               if (!cpu_possible(i))
+                       continue;
+               blkp->blkaddr[i] = kmem_cache_alloc(blklist->cachep, flags);
+               if (!(blkp->blkaddr[i]))
+                       goto out1;
+               memset(blkp->blkaddr[i], 0, blklist->blksize);
+       }
+
+       /*
+        * Now that we have the block successfully allocated
+        * and instantiated..  add it.....
+        */
+       spin_lock_irqsave(&blklist->lock, save_flags);
+       list_add_tail(&blkp->linkage, &blklist->blks);
+       if (blklist->firstnotfull == &blklist->blks)
+               blklist->firstnotfull = &blkp->linkage;
+       spin_unlock_irqrestore(&blklist->lock, save_flags);
+       return 1;
+
+out1:
+       i--;
+       for (; i >= 0; i--) {
+               if (!cpu_possible(i))
+                       continue;
+               kmem_cache_free(blklist->cachep, blkp->blkaddr[i]);
+       }
+       percpu_data_blk_free(blkp);
+out:
+       return 0;
+}
+
+/*
+ * Initialise the main percpu data control structure.
+ */
+static void __init
+percpu_data_blklist_init(struct percpu_data_blklist *blklist)
+{
+       blklist->cachep = kmem_cache_create(blklist->cachename,
+                                           blklist->blksize, 0,
+                                           SLAB_HWCACHE_ALIGN, NULL, NULL);
+       if (!blklist->cachep)
+               BUG();
+}
+
+static struct percpu_data_blklist *
+percpu_data_get_blklist(size_t size, int flags)
+{
+       int i;
+       for (i = 0; i < data_blklist_count; i++) {
+               if (size > data_blklist[i].objsize)
+                       continue;
+               return &data_blklist[i];
+       }
+       return NULL;
+}
+
+/*
+ * Initialize the percpu_data subsystem.
+ */
+void __init
+kmalloc_percpu_init(void)
+{
+       int i;
+       for (i = 0; i < data_blklist_count; i++) {
+      
...


kmalloc_percpu -- 1 of 2

Post by Andrew Morton » Thu, 05 Dec 2002 21:40:10



> Here's a 2.5.50 version of kmalloc_percpu originally submitted by Dipankar.
> +/* Use these with kmalloc_percpu */
> +#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
> +#define put_cpu_ptr(ptr) put_cpu()

These names sound very much like get_cpu_var() and put_cpu_var(),
yet they are using a quite different subsystem.  It would be good
to choose something more distinct.  Not that I can think of anything
at present ;)

The commentary above these functions should clearly state that thou
shalt not sleep between them.
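
Something along these lines, say (a sketch; "foo_stats"/"stats" are
hypothetical, with stats coming from kmalloc_percpu()):

	/*
	 * get_cpu_ptr() disables preemption until the matching
	 * put_cpu_ptr(), so the section between them must not sleep.
	 */
	struct foo_stats *s = get_cpu_ptr(stats);	/* preempt off */
	s->count++;				/* no blocking calls here */
	put_cpu_ptr(stats);			/* preempt back on */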

> ...
> --- linux-2.5.50/kernel/Makefile        Thu Nov 28 04:05:51 2002
> +++ kmalloc_percpu-2.5.50/kernel/Makefile       Sun Dec  1 11:54:49 2002

>  export-objs = signal.o sys.o kmod.o workqueue.o ksyms.o pm.o exec_domain.o \
>                 printk.o platform.o suspend.o dma.o module.o cpufreq.o \
> -               profile.o rcupdate.o intermodule.o
> +               profile.o rcupdate.o intermodule.o percpu.o

I suppose so.  It _could_ be in mm/percpu.c

> ...
> +static int data_blklist_count =
> +    sizeof (data_blklist) / sizeof (struct percpu_data_blklist);

The ARRAY_SIZE macro is suitable here.
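
i.e. something like:

	static int data_blklist_count = ARRAY_SIZE(data_blklist);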

> +static struct percpu_data_blk *
> +percpu_data_blk_alloc(struct percpu_data_blklist *blklist, int flags)
> ...
> +static void
> +percpu_data_blk_free(struct percpu_data_blk *blkp)
> ...
> +static int
> +percpu_data_mem_grow(struct percpu_data_blklist *blklist, int flags)
> ...
> +static void __init
> +percpu_data_blklist_init(struct percpu_data_blklist *blklist)
> ...
> +static struct percpu_data_blklist *
> +percpu_data_get_blklist(size_t size, int flags)
> ...
> +static int
> +__percpu_interlaced_alloc_one(struct percpu_data_blklist *blklist,
> +                             struct percpu_data_blk *blkp)
> ...
> +static int
> +__percpu_interlaced_alloc(struct percpu_data *percpu,
> +                         struct percpu_data_blklist *blklist, int flags)
> ...
> +static int
> +percpu_interlaced_alloc(struct percpu_data *pdata, size_t size, int flags)
> ...
> +static void
> +percpu_data_blk_destroy(struct percpu_data_blklist *blklist,
> +                       struct percpu_data_blk *blkp)
> ...
> +static void
> +__percpu_interlaced_free(struct percpu_data_blklist *blklist,
> +                        struct percpu_data *percpu)
> ...
> +static void
> +percpu_interlaced_free(struct percpu_data *percpu)
> ...

ummm.  What on earth is all that stuff?

> +void *
> +kmalloc_percpu(size_t size, int flags)
> +{
> +       int i;
> +       struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);
> +
> +       if (!pdata)
> +               goto out_done;
> +       pdata->blkp = NULL;
> +       if (size <= (malloc_sizes[0].cs_size >> 1)) {
> +               if (!percpu_interlaced_alloc(pdata, size, flags))
> +                       goto out;
> +       } else {
> +               for (i = 0; i < NR_CPUS; i++) {
> +                       if (!cpu_possible(i))
> +                               continue;
> +                       pdata->ptrs[i] = kmalloc(size, flags);
> +                       if (!pdata->ptrs[i])
> +                               goto unwind_oom;
> +               }
> +       }
> +       /* Catch derefs w/o wrappers */
> +       return (void *) (~(unsigned long) pdata);
> +
> +unwind_oom:
> +       while (--i >= 0) {
> +               if (!cpu_possible(i))
> +                       continue;
> +               kfree(pdata->ptrs[i]);
> +       }
> +out:
> +       kfree(pdata);
> +out_done:
> +       return NULL;
> +}

If we were to remove the percpu_interlaced_alloc() leg here, we'd
have a nice, simple per-cpu kmalloc implementation.

Could you please explain what all the other code is there for?
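
(For reference, the simple implementation suggested above would be the
quoted function with the interlaced leg and pdata->blkp dropped -- roughly
this sketch:)

void *
kmalloc_percpu(size_t size, int flags)
{
	int i;
	struct percpu_data *pdata = kmalloc(sizeof (*pdata), flags);

	if (!pdata)
		return NULL;
	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible(i))
			continue;
		pdata->ptrs[i] = kmalloc(size, flags);
		if (!pdata->ptrs[i])
			goto unwind_oom;
	}
	/* Catch derefs w/o wrappers */
	return (void *) (~(unsigned long) pdata);

unwind_oom:
	while (--i >= 0) {
		if (cpu_possible(i))
			kfree(pdata->ptrs[i]);
	}
	kfree(pdata);
	return NULL;
}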

kmalloc_percpu -- 1 of 2

Post by Dipankar Sarma » Fri, 06 Dec 2002 05:40:08



> > +/* Use these with kmalloc_percpu */
> > +#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
> > +#define put_cpu_ptr(ptr) put_cpu()

> These names sound very much like get_cpu_var() and put_cpu_var(),
> yet they are using a quite different subsystem.  It would be good
> to choose something more distinct.  Not that I can think of anything
> at present ;)

Well, they are similar, aren't they? get_cpu_ptr() can just be thought
of as the dynamic twin of get_cpu_var(). So, in that sense it seems OK
to me.

> If we were to remove the percpu_interlaced_alloc() leg here, we'd
> have a nice, simple per-cpu kmalloc implementation.

> Could you please explain what all the other code is there for?

The interlaced allocator allows you to save space when kmalloc_percpu()
is used to allocate small objects. That is done by interlacing each
CPU's copy of the objects just like the static per-cpu data area.
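
Schematically (illustrative sizes: four 8-byte objects on a 32-byte line):

	/*
	 * CPU0's block:  | objA | objB | objC | objD |  <- one cache line
	 * CPU1's block:  | objA | objB | objC | objD |  <- one cache line
	 * ...
	 * Four separate kmalloc_percpu(8, ...) objects (A-D) share each
	 * CPU's cache line instead of each burning a full line per CPU.
	 */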

Thanks
--

Linux Technology Center, IBM Software Lab, Bangalore, India.

kmalloc_percpu -- 1 of 2

Post by Andrew Morton » Fri, 06 Dec 2002 06:40:04




> > > +/* Use these with kmalloc_percpu */
> > > +#define get_cpu_ptr(ptr) per_cpu_ptr(ptr, get_cpu())
> > > +#define put_cpu_ptr(ptr) put_cpu()

> > These names sound very much like get_cpu_var() and put_cpu_var(),
> > yet they are using a quite different subsystem.  It would be good
> > to choose something more distinct.  Not that I can think of anything
> > at present ;)

> Well, they are similar, aren't they? get_cpu_ptr() can just be thought
> of as the dynamic twin of get_cpu_var(). So, in that sense it seems OK
> to me.

hm.  spose so.

> > If we were to remove the percpu_interlaced_alloc() leg here, we'd
> > have a nice, simple per-cpu kmalloc implementation.

> > Could you please explain what all the other code is there for?

> The interlaced allocator allows you to save space when kmalloc_percpu()
> is used to allocate small objects. That is done by interlacing each
> CPU's copy of the objects just like the static per-cpu data area.

Where in the kernel is such a large number of 4-, 8- or 16-byte
objects being used?

The slab allocator will support caches right down to 1024 x 4-byte
objects per page.  Why is that not appropriate?

If it is for locality-of-reference between individual objects then
where in the kernel is this required, and are performance measurements
available?  It is very unusual to have objects which are so small,
and a better design would be to obtain the locality of reference
by aggregating the data into an array or structure.
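
That is, rather than several tiny per-cpu allocations (a sketch,
hypothetical names):

	struct dev_stats {		/* aggregate the small counters */
		unsigned long rx_packets;
		unsigned long tx_packets;
		unsigned long rx_errors;
		unsigned long tx_errors;
	};

	/* one per-cpu allocation of the aggregate; the returned pointer
	 * is encoded and must only be used via the per_cpu_ptr wrappers */
	struct dev_stats *stats =
		kmalloc_percpu(sizeof (struct dev_stats), GFP_KERNEL);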

Sorry, but you have what is basically a brand new allocator in
there, and we need a very good reason for including it.  I'd like
to know what that reason is, please.

kmalloc_percpu -- 1 of 2

Post by William Lee Irwin III » Fri, 06 Dec 2002 06:50:10



> Where in the kernel is such a large number of 4-, 8- or 16-byte
> objects being used?
> The slab allocator will support caches right down to 1024 x 4-byte
> objects per page.  Why is that not appropriate?
> If it is for locality-of-reference between individual objects then
> where in the kernel is this required, and are performance measurements
> available?  It is very unusual to have objects which are so small,
> and a better design would be to obtain the locality of reference
> by aggregating the data into an array or structure.

I will argue not on the frequency of calls but on the preciousness of
space; highmem feels very serious pain when internal fragmentation
of pinned pages occurs (which this is designed to prevent). I don't
have direct experience with this patch/API, but I can say that
fragmentation in ZONE_NORMAL is deadly (witness pagetable occupancy
vs. ZONE_NORMAL consumption, which motivated highpte, despite my
pagetable reclamation smoke blowing).

Bill

kmalloc_percpu -- 1 of 2

Post by Dipankar Sarma » Fri, 06 Dec 2002 13:00:08


Hi Andrew,


> Where in the kernel is such a large number of 4-, 8- or 16-byte
> objects being used?

Well, kernel objects may not be that small, but one would expect
the per-cpu parts of the kernel objects to be sometimes small, often down to
a couple of counters counting statistics.

> The slab allocator will support caches right down to 1024 x 4-byte
> objects per page.  Why is that not appropriate?

Well, if you allocated 4-byte objects directly from the slab allocator,
you aren't guaranteed to *not* share a cache line with another object
modified by a different cpu.

> Sorry, but you have what is basically a brand new allocator in
> there, and we need a very good reason for including it.  I'd like
> to know what that reason is, please.

The reason is concern about per-cpu allocation for small per-CPU
parts (typically counters) of objects. If a driver has two counters
counting reads and writes, you don't want to eat up a whole cacheline
for them for each CPU per instance of the device.
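
(Illustrative arithmetic: two unsigned long counters are 8 bytes on a
32-bit machine, but cacheline alignment pads each CPU's copy out to
SMP_CACHE_BYTES, e.g. 128 bytes on a P4 -- a 16x overhead, multiplied by
NR_CPUS and again by the number of device instances.)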

Thanks
--

Linux Technology Center, IBM Software Lab, Bangalore, India.

kmalloc_percpu -- 1 of 2

Post by Victor Yodaiken » Fri, 06 Dec 2002 13:30:26



> Hi Andrew,


> > Where in the kernel is such a large number of 4-, 8- or 16-byte
> > objects being used?

> Well, kernel objects may not be that small, but one would expect
> the per-cpu parts of the kernel objects to be sometimes small, often down to
> a couple of counters counting statistics.

Doesn't your allocator increase chances of cache conflict on the same
cpu?


kmalloc_percpu -- 1 of 2

Post by William Lee Irwin III » Fri, 06 Dec 2002 13:40:05



>>> Where in the kernel is such a large number of 4-, 8- or 16-byte
>>> objects being used?

> > Well, kernel objects may not be that small, but one would expect
> > the per-cpu parts of the kernel objects to be sometimes small, often down to
> > a couple of counters counting statistics.

> Doesn't your allocator increase chances of cache conflict on the same
> cpu?

This is so; I'm personally far more concerned about ZONE_NORMAL space
consumption in the cacheline aligned case.

Bill

kmalloc_percpu -- 1 of 2

Post by Dipankar Sarma » Fri, 06 Dec 2002 14:40:09



> > Well, kernel objects may not be that small, but one would expect
> > the per-cpu parts of the kernel objects to be sometimes small, often down to
> > a couple of counters counting statistics.

> Doesn't your allocator increase chances of cache conflict on the same
> cpu?

You mean by increasing the footprint and the chance of eviction? It
is a compromise. Or you would face NR_CPUS bloat and non-NUMA-node-local
accesses for all CPUs outside the NUMA node where your NR_CPUS array
is located.

Thanks
--

Linux Technology Center, IBM Software Lab, Bangalore, India.

kmalloc_percpu -- 1 of 2

Post by Victor Yodaiken » Fri, 06 Dec 2002 17:20:09



> > Doesn't your allocator increase chances of cache conflict on the same
> > cpu?

> You mean by increasing the footprint and the chance of eviction ? It
> is a compromise. Or you would face NR_CPUS bloat and non-NUMA-node-local
> accesses for all CPUs outside the NUMA node where your NR_CPUS array
> is located.

What do you base the trade-off decision on?


--
---------------------------------------------------------
Victor Yodaiken
Finite State Machine Labs: The RTLinux Company.
www.fsmlabs.com  www.rtlinux.com
1+ 505 838 9109


kmalloc_percpu -- 1 of 2

Post by Andrew Morton » Fri, 06 Dec 2002 22:10:16



> Hi Andrew,


> > Where in the kernel is such a large number of 4-, 8- or 16-byte
> > objects being used?

> Well, kernel objects may not be that small, but one would expect
> the per-cpu parts of the kernel objects to be sometimes small, often down to
> a couple of counters counting statistics.

Sorry, "one would expect" is not sufficient grounds for incorporation of
a new allocator.  As far as I can tell, all the proposed users are in
fact allocating decent-sized aggregates, and that will remain the usual
case.

The code exists, great.  We can pull it in when there is a demonstrated
need for it.  But until that need is shown, this is overdesign.

> > The slab allocator will support caches right down to 1024 x 4-byte
> > objects per page.  Why is that not appropriate?

> Well, if you allocated 4-byte objects directly from the slab allocator,
> you aren't guaranteed to *not* share a cache line with another object
> modified by a different cpu.

If that's a problem it can be addressed in the slab head arrays - make
sure that they are always filled and emptied in multiple-of-cacheline-sized
units for objects which are smaller than a cacheline.  That benefits all
slab users.

> > Sorry, but you have what is basically a brand new allocator in
> > there, and we need a very good reason for including it.  I'd like
> > to know what that reason is, please.

> The reason is concern about per-cpu allocation for small per-CPU
> parts (typically counters) of objects. If a driver has two counters
> counting reads and writes, you don't want to eat up a whole cacheline
> for them for each CPU per instance of the device.

I don't buy it.

- If the driver has two counters per device then the storage is
  infinitesimal.

- If it has multiple counters per device (always the case) then
  the driver will aggregate them anyway.

I am not aware of any situations in which a driver has a large
(or even medium) number of small, discrete counters of this nature.
Sufficiently large to justify a new allocator.

I'd suggest that you drop the new allocator until a compelling
need for it (in real, live 2.5/2.6 code) has been demonstrated.

kmalloc_percpu -- 1 of 2

Post by Dipankar Sarma » Fri, 06 Dec 2002 23:30:23



> I'd suggest that you drop the new allocator until a compelling
> need for it (in real, live 2.5/2.6 code) has been demonstrated.

Fine with me, since at least one workaround for fragmentation with small
allocations is known. I can't see anything in the 2.5 timeframe
requiring small per-cpu allocations.

Would you like me to resubmit a simple kmalloc-only version?

Thanks
Dipankar

kmalloc_percpu -- 1 of 2

Post by Andrew Morton » Sat, 07 Dec 2002 00:20:11




> > I'd suggest that you drop the new allocator until a compelling
> > need for it (in real, live 2.5/2.6 code) has been demonstrated.

> Fine with me since atleast one workaround for fragmentation with small
> allocations is known. I can't see anything in 2.5 timeframe
> requiring small per-cpu allocations.

> Would you like me to resubmit a simple kmalloc-only version ?

I think that would be best.

BTW, looking at the snmp application of this work:

+#define ICMP_INC_STATS_USER_FIELD(offt)                                \
+       (*((unsigned long *) ((void *)                                  \
+                            per_cpu_ptr(icmp_statistics[1],            \
+                                        smp_processor_id())) + offt))++;

This guy is racy on preempt.  Just a little bit.  It is effectively:

        ptr = per_cpu_ptr(...);
        (*ptr)++;

On some architectures, `(*ptr)++' is not atomic wrt interrupts.  The
CPU could be preempted midway through the increment.

Surely it's not an issue for SNMP stats, but for some applications
such as struct page_state, such a permanent off-by-a-little-bit would
be a showstopper.

So some big loud comments which describe the worthiness of get_cpu_ptr(),
and the potential inaccuracy of per_cpu_ptr would be useful.

And as this is the first application of the kmalloc_percpu infrastructure,
it may be best to convert it to use get_cpu_ptr/put_cpu_ptr.
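
That is, something like this (sketch):

	unsigned long *p;

	p = (unsigned long *) get_cpu_ptr(icmp_statistics[1]);
	p[offt]++;		/* preemption is off: no migration in the
				   middle of the increment */
	put_cpu_ptr(icmp_statistics[1]);

(This addresses preemption only; the increment is still not atomic
against interrupts on all architectures.)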