[PATCH] struct kernel_stat -> struct cpu_stat[NR_CPUS]

[PATCH] struct kernel_stat -> struct cpu_stat[NR_CPUS]

Post by Zach Brown » Wed, 04 Jul 2001 08:40:09



Currently struct kernel_stat has a few per cpu arrays.  This creates
cacheline exchange noise as the cpus update their entries in each array.
This patch creates an array of per cpu structs.  The structure is padded
to the length of a cacheline.  The meat of the patch against 2.4.6-pre8
is attached.  The rest of the patch is rather large because it touches
all the architectures' use of ks->irqs[], and can be found at

        http://www.osdlab.org/sw_resources/cpustat/cpustat-2.4.6.pre8-1.diff

These per cpu statistics are reported via a new /proc/cpustat, a quick
tool for processing that output, vmstat-style, can be found near

        http://www.osdlab.org/sw_resources/cpustat/index.shtml

In addition to shuffling structures around, the patch adds the recording
of context scheduling migrations as well as "minor", "major", and
"cow" faults.

I'd really like to hear what people think of the patch.  Unless someone
is dead set against it, I'd like to send it off to Linus and
move on to other things :)  Would arch maintainers rather that I fed
them per-arch patches for their trees?

- z

--- linux-2.4.6-pre8/fs/proc/proc_misc.c.cpustat        Fri Apr 13 20:26:07 2001
+++ linux-2.4.6-pre8/fs/proc/proc_misc.c        Mon Jul  2 15:04:49 2001
@@ -265,32 +265,37 @@
        int i, len;
        extern unsigned long total_forks;
        unsigned long jif = jiffies;
-       unsigned int sum = 0, user = 0, nice = 0, system = 0;
+       unsigned int sum = 0, user = 0, nice = 0, system = 0, ctxt = 0;
        int major, disk;

        for (i = 0 ; i < smp_num_cpus; i++) {
                int cpu = cpu_logical_map(i), j;

-               user += kstat.per_cpu_user[cpu];
-               nice += kstat.per_cpu_nice[cpu];
-               system += kstat.per_cpu_system[cpu];
+               user += cpu_stat[cpu].user;
+               nice += cpu_stat[cpu].nice;
+               system += cpu_stat[cpu].system;
+               ctxt += cpu_stat[cpu].context_swtch;
 #if !defined(CONFIG_ARCH_S390)
                for (j = 0 ; j < NR_IRQS ; j++)
-                       sum += kstat.irqs[cpu][j];
+                       sum += cpu_stat[cpu].irqs[j];
 #endif
        }

        len = sprintf(page, "cpu  %u %u %u %lu\n", user, nice, system,
                      jif * smp_num_cpus - (user + nice + system));
-       for (i = 0 ; i < smp_num_cpus; i++)
+       for (i = 0 ; i < smp_num_cpus; i++) {
+               unsigned int user_i, nice_i, system_i;
+               int cpu = cpu_logical_map(i);
+
+               user_i = cpu_stat[cpu].user;
+               nice_i = cpu_stat[cpu].nice;
+               system_i = cpu_stat[cpu].system;
+
                len += sprintf(page + len, "cpu%d %u %u %u %lu\n",
                        i,
-                       kstat.per_cpu_user[cpu_logical_map(i)],
-                       kstat.per_cpu_nice[cpu_logical_map(i)],
-                       kstat.per_cpu_system[cpu_logical_map(i)],
-                       jif - (  kstat.per_cpu_user[cpu_logical_map(i)] \
-                                  + kstat.per_cpu_nice[cpu_logical_map(i)] \
-                                  + kstat.per_cpu_system[cpu_logical_map(i)]));
+                       user_i, nice_i, system_i,
+                       jif - (  user_i + nice_i + system_i ) );
+       }
        len += sprintf(page + len,
                "page %u %u\n"
                 "swap %u %u\n"
@@ -330,13 +335,64 @@
                "\nctxt %u\n"
                "btime %lu\n"
                "processes %lu\n",
-               kstat.context_swtch,
+               ctxt,
                xtime.tv_sec - jif / HZ,
                total_forks);

        return proc_calc_metrics(page, start, off, count, eof, len);
 }

+static int cstat_read_proc(char *page, char **start, off_t off,
+                                int count, int *eof, void *data)
+{
+       int i, len;
+
+       len = sprintf(page, "cpu_stat 0.0\n");
+
+       for (i = 0 ; i < smp_num_cpus; i++) {
+               unsigned int user, nice, system;
+               int j, cpu = cpu_logical_map(i);
+               struct cpu_stat *cs = &cpu_stat[cpu];
+
+#if !defined(CONFIG_ARCH_S390)
+               len += sprintf(page + len, "cpu%d irqs ",  cpu);
+               for (j = 0 ; j < NR_IRQS ; j++) {
+                       len += sprintf(page + len, " %u",
+                               cs->irqs[j]);
+               }
+               len += sprintf(page + len, "\n");
+#endif
+#if defined(CONFIG_SMP)
+               len += sprintf(page + len, "cpu%d context_migration %u\n",  
+                       cpu, cs->context_migration);
+#endif
+               len += sprintf(page + len, "cpu%d context_switches %u\n",  
+                       cpu, cs->context_swtch);
+
+               len += sprintf(page + len, "cpu%d major_faults %u\n",  
+                       cpu, cs->major_fault);
+               len += sprintf(page + len, "cpu%d minor_faults %u\n",  
+                       cpu, cs->minor_fault);
+               len += sprintf(page + len, "cpu%d cow_faults %u\n",  
+                       cpu, cs->cow_fault);
+
+               user = cs->user;
+               nice = cs->nice;
+               system = cs->system;
+
+               len += sprintf(page + len, "cpu%d user_time %u\n",  
+                       cpu, user);
+               len += sprintf(page + len, "cpu%d nice_time %u\n",  
+                       cpu, nice);
+               len += sprintf(page + len, "cpu%d system_time %u\n",  
+                       cpu, system);
+               len += sprintf(page + len, "cpu%d unaccounted_time %lu\n",  
+                       cpu, jiffies - (  user + nice + system ) );
+       }
+
+       return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
 static int devices_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
 {
@@ -532,6 +588,7 @@
                {"ksyms",     ksyms_read_proc},
 #endif
                {"stat",      kstat_read_proc},
+               {"cpustat",   cstat_read_proc},
                {"devices",   devices_read_proc},
                {"partitions",        partitions_read_proc},
 #if !defined(CONFIG_ARCH_S390)
--- linux-2.4.6-pre8/kernel/sched.c.cpustat     Mon Jul  2 15:04:21 2001
+++ linux-2.4.6-pre8/kernel/sched.c     Mon Jul  2 15:04:49 2001
@@ -107,6 +107,8 @@

 struct kernel_stat kstat;

+struct cpu_stat cpu_stat[NR_CPUS] __cacheline_aligned = { { 0, } };
+
 #ifdef CONFIG_SMP

 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
@@ -607,6 +609,7 @@
        sched_data->curr = next;
 #ifdef CONFIG_SMP
        next->has_cpu = 1;
+       cpu_stat[this_cpu].context_migration += (next->processor != this_cpu);
        next->processor = this_cpu;
 #endif
        spin_unlock_irq(&runqueue_lock);
@@ -632,7 +635,7 @@

 #endif /* CONFIG_SMP */

-       kstat.context_swtch++;
+       cpu_stat[this_cpu].context_swtch++;
        /*
         * there are 3 processes which are affected by a context switch:
         *
--- linux-2.4.6-pre8/kernel/timer.c.cpustat     Mon Jul  2 15:04:21 2001
+++ linux-2.4.6-pre8/kernel/timer.c     Mon Jul  2 15:04:49 2001
@@ -588,12 +588,12 @@
                        p->need_resched = 1;
                }
                if (p->nice > 0)
-                       kstat.per_cpu_nice[cpu] += user_tick;
+                       cpu_stat[cpu].nice += user_tick;
                else
-                       kstat.per_cpu_user[cpu] += user_tick;
-               kstat.per_cpu_system[cpu] += system;
+                       cpu_stat[cpu].user += user_tick;
+               cpu_stat[cpu].system += system;
        } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1)
-               kstat.per_cpu_system[cpu] += system;
+               cpu_stat[cpu].system += system;
 }

 /*
--- linux-2.4.6-pre8/mm/memory.c.cpustat        Mon Jul  2 15:04:21 2001
+++ linux-2.4.6-pre8/mm/memory.c        Mon Jul  2 15:04:49 2001
@@ -48,6 +48,8 @@
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>

+#include <linux/kernel_stat.h>
+
 unsigned long max_mapnr;
 unsigned long num_physpages;
 void * high_memory;
@@ -931,6 +933,7 @@
                        break;
                flush_cache_page(vma, address);
                establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
+               THIS_CPU_STAT_ADD(minor_fault, 1);
                return 1;       /* Minor fault */
        }

@@ -955,6 +958,7 @@
                new_page = old_page;
        }
        page_cache_release(new_page);
+       THIS_CPU_STAT_ADD(cow_fault, 1);
        return 1;       /* Minor fault */

 bad_wp_page:
@@ -1250,6 +1254,7 @@

        /* no need to invalidate: a not-present page shouldn't be cached */
        update_mmu_cache(vma, address, entry);
+       THIS_CPU_STAT_ADD(major_fault, 1);
        return 2;       /* Major fault */
 }

--- linux-2.4.6-pre8/include/linux/kernel_stat.h.cpustat        Fri May 25 18:01:27 2001
+++ linux-2.4.6-pre8/include/linux/kernel_stat.h        Mon Jul  2 15:09:45 2001
@@ -16,9 +16,6 @@
 #define DK_MAX_DISK 16

 struct kernel_stat {
-       unsigned int per_cpu_user[NR_CPUS],
-                    per_cpu_nice[NR_CPUS],
-                    per_cpu_system[NR_CPUS];
        unsigned int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK];
@@ -26,17 +23,35 @@
        unsigned int dk_drive_wblk[DK_MAX_MAJOR][DK_MAX_DISK];
        unsigned int pgpgin, pgpgout;
        unsigned int pswpin, pswpout;
-#if !defined(CONFIG_ARCH_S390)
-       unsigned int irqs[NR_CPUS][NR_IRQS];
-#endif
        unsigned int ipackets, opackets;
        unsigned int ierrors, oerrors;
        unsigned int collisions;
-       unsigned int context_swtch;
 };

 extern struct kernel_stat kstat;

+struct cpu_stat {
+       unsigned int user, nice, system;
+       unsigned long major_fault, minor_fault, cow_fault;
+       unsigned int context_swtch;
+#if defined(CONFIG_SMP)
+       unsigned long context_migration;
+#endif
+#if !defined(CONFIG_ARCH_S390)
+       unsigned int irqs[NR_IRQS];
+#endif
+
+       char __padding_dummy[0] ____cacheline_aligned;
+};
+
+extern struct cpu_stat cpu_stat[NR_CPUS];
+
+#define THIS_CPU_STAT_ADD(STAT, VAL) cpu_stat[current->processor].STAT += VAL
+/*
+ * 'CPU' as returned by smp_processor_id() or cpu_logical_map(0..smp_num_cpus)
+ */
+#define CPU_STAT_IRQ_INC(CPU, IRQ) cpu_stat[CPU].irqs[IRQ]++
+
 #if !defined(CONFIG_ARCH_S390)
 /*
  * Number of interrupts per specific IRQ source, since bootup
@@ -46,7 +61,7 @@
        int i, sum=0;

        for (i = 0 ; i < smp_num_cpus ; i++)
-               sum += kstat.irqs[cpu_logical_map(i)][irq];
+               sum += cpu_stat[cpu_logical_map(i)].irqs[irq];

        return sum;
 }
--- linux-2.4.6-pre8/arch/i386/kernel/irq.c.cpustat     Mon Jul  2 15:04:19 2001
+++ linux-2.4.6-pre8/arch/i386/kernel/irq.c     Mon Jul  2 15:09:58 2001
@@ -152,7 +152,7 @@
 #else
                for (j = 0; j < smp_num_cpus; j++)
                        p += sprintf(p, "%10u ",
-                               kstat.irqs[cpu_logical_map(j)][i]);
+                               cpu_stat[cpu_logical_map(j)].irqs[i]);
 #endif
                p += sprintf(p, " %14s", irq_desc[i].handler->typename);
                p += sprintf(p, "  %s", action->name);
@@ -575,7 +575,7 @@
        struct irqaction * action;
        unsigned int status;

-       kstat.irqs[cpu][irq]++;
+       CPU_STAT_IRQ_INC(cpu, irq);
        spin_lock(&desc->lock);
        desc->handler->ack(irq);
        /*
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

[PATCH] struct kernel_stat -> struct cpu_stat[NR_CPUS]

Post by Andreas Dilger » Wed, 04 Jul 2001 16:00:12


Quote:Zack writes:
> These per cpu statistics are reported via a new /proc/cpustat, a quick
> tool for processing that output, vmstat-style, can be found near

Could you consider /proc/cpu/0/stats or similar?  It is much nicer
than polluting the top-level /proc directory, and I believe there
are a bunch of other per-cpu items waiting to go there as well
(process binding, hot-swap CPU stuff, etc)

Cheers, Andreas
--
Andreas Dilger  \ "If a man ate a pound of pasta and a pound of antipasto,
                 \  would they cancel out, leaving him still hungry?"
http://www-mddsp.enel.ucalgary.ca/People/adilger/               -- Dogbert
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

[PATCH] struct kernel_stat -> struct cpu_stat[NR_CPUS]

Post by Pavel Machek » Sat, 14 Jul 2001 05:50:10


Hi!

Quote:> > These per cpu statistics are reported via a new /proc/cpustat, a quick
> > tool for processing that output, vmstat-style, can be found near

> Could you consider /proc/cpu/0/stats or similar?  It is much nicer
> than polluting the top-level /proc directory, and I believe there
> are a bunch of other per-cpu items waiting to go there as well
> (process binding, hot-swap CPU stuff, etc)

Add throttling, C-states, and similar acpi stuff. I'd like it to go
into /proc/cpu/0 rather than be buried somewhere into /proc/acpi.

                                                                Pavel
--


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

 
 
 

1. [PATCH] cutting up struct kernel_stat into cpu_stat

The attached patch-in-progress removes the per-cpu statistics from
struct kernel_stat and puts them in a cpu_stat structure, one per cpu,
cacheline padded.  The data is still collated and presented through
/proc/stat, but another file /proc/cpustat is also added.  The locking
is as nonexistent as it was with kernel_stat, but who cares, they're
just fuzzy stats to be eyeballed by system tuners :).

A tool for printing the cpu stats specifically can be found near:

        http://www.osdlab.org/sw_resources/cpustat/index.shtml

Its output is almost identical to solaris' mpstat.  

I'm not sure I like the macro use, but it shields the callers from the
union garbage.  We can easily also make a THIS_CPU_STAT_ADD() interface,
as some have hinted would be nice :)

Currently it's mostly ( :) ) only collecting the stats that were
collected in kernel_stat.  I'd like to add more stats -- page faults,
syscalls, cross-cpu calls, etc.  I understand people not wanting more
live cachelines in the fast paths.  I can make CPU_CRITICAL_STAT defines
that are config-ed out..

comments?  If it's OK I can whip up a patch that updates all the ports'
use of ->irqs[] as well.

- z
[ heading out for lunch :) ]

  cpustat-2.4.5-1.diff
7K Download

2. How do I make a .xinitrc file

3. 2.4.7-pre3 kernel_stat -> cpu_stat[NR_CPUS]

4. Serial ports on netbsd for MVME147?

5. struct kernel_stat kstat; /* question */

6. [patch] ultra-scalable O(1) SMP and UP scheduler

7. Can I extend struct kernel_stat?

8. will it possible, without.....

9. converting 'struct stat' to 'struct __old_kernel_stat'?

10. structs declared inside structs in /usr/include

11. timer_create refers to struct sigevent; struct sigevent not availabe unders same conditions as timer_create

12. getting "struct pci_dev" from "struct netdevice"

13. Patch/resubmit(2.5.50): Use struct io_restrictions in blkdev.h