Rmap speedup

Rmap speedup

Post by Daniel Phillips » Sun, 04 Aug 2002 04:50:07



This patch eliminates about 35% of the raw rmap setup/teardown overhead by
adopting a new locking interface that allows the add_rmaps to be batched in
copy_page_range.  This is a work in progress.  I expect to show a further 35%
overhead reduction shortly, by batching the remove_rmaps as well.  Further
gains will come more slowly, but I hope that an immediate 70% reduction in
overhead gets us into the doesn't-suck-too-much range, and we can move on
to other benchmarks.
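
In outline: each page hashes by page->index into a small array of spinlocks,
and copy_page_range keeps whatever lock it already holds for as long as
consecutive pages hash to the same lock bucket.  A minimal sketch of the
batching, pulled out of copy_page_range for clarity (helper names as in the
patch below; the pte_chain preallocation and the pages/ptes/chains arrays
here are only for illustration):

/*
 * Take the hashed rmap lock once per run of pages that share a bucket,
 * not once per pte.
 */
static void add_rmaps_batched(struct page **pages, pte_t **ptes,
                              struct pte_chain **chains, int count)
{
        spinlock_t *rmap_lock = NULL;
        unsigned last_locknum = -1;
        int i;

        for (i = 0; i < count; i++) {
                struct page *page = pages[i];

                if (last_locknum != rmap_locknum(page->index)) {
                        if (last_locknum != -1)
                                spin_unlock(rmap_lock);
                        rmap_lock = lock_rmap(page);
                        last_locknum = rmap_locknum(page->index);
                }
                add_rmap_nolock(page, ptes[i], chains[i]);
        }
        if (last_locknum != -1)
                spin_unlock(rmap_lock);
}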

This patch is against 2.4.19-pre7+rmap13b.  I'll forward-port to 2.5 in due
course; in the meantime this should allow you to verify my results.

Here is the script I used, essentially the same as the one you originally
posted, but all in one piece:

---------------------------
#!/bin/bash
# bash is needed for the (( )) and let constructs used below

doit()
{
        ( cat $1 | wc -l )

}

doitlots()
{
count=0

while (( count<500 ))
do
        doit "$1" >/dev/null

        count=$(expr $count + 1)
done
echo done

}

echo hello >foobar
rm -f foocount
echo >foocount

count=0
while (( count<10 ))
do
        doitlots foobar >>foocount &
        let count++
done

count=0
while (( count<10 ))
do
        count=$(cat foocount | wc -l)
done
---------------------------

--- 2.4.19-pre7.clean/include/linux/mm.h        Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/include/linux/mm.h      Fri Aug  2 17:45:04 2002
@@ -131,8 +131,10 @@
        struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
 };

-/* forward declaration; pte_chain is meant to be internal to rmap.c */
-struct pte_chain;
+struct pte_chain {
+       struct pte_chain * next;
+       pte_t * ptep;
+};

 /*
  * Each physical page in the system has a struct page associated with
@@ -324,29 +326,40 @@
 #define PageLaunder(page)      test_bit(PG_launder, &(page)->flags)
 #define SetPageLaunder(page)   set_bit(PG_launder, &(page)->flags)

-/*
- * inlines for acquisition and release of PG_chainlock
- */
-static inline void pte_chain_lock(struct page *page)
+#define num_rmap_locks (1 << 8)
+
+extern spinlock_t rmap_locks[num_rmap_locks];
+
+void init_rmap_locks(void);
+
+static inline unsigned long rmap_locknum(unsigned long index)
 {
-       /*
-        * Assuming the lock is uncontended, this never enters
-        * the body of the outer loop. If it is contended, then
-        * within the inner loop a non-atomic test is used to
-        * busywait with less bus contention for a good time to
-        * attempt to acquire the lock bit.
-        */
-       while (test_and_set_bit(PG_chainlock, &page->flags)) {
-               while (test_bit(PG_chainlock, &page->flags))
-                       cpu_relax();
-       }
+       return (index >> 4) & (num_rmap_locks - 1);
 }

-static inline void pte_chain_unlock(struct page *page)
+static inline spinlock_t *lock_rmap(struct page *page)
 {
-       clear_bit(PG_chainlock, &page->flags);
+       unsigned long index = page->index;
+       while (1) {
+               spinlock_t *lock = rmap_locks + rmap_locknum(index);
+               spin_lock(lock);
+               if (index == page->index)
+                       return lock;
+               spin_unlock(lock);
+       }      
 }

+static inline void set_page_index(struct page *page, unsigned long index)
+{
+       spinlock_t *lock = lock_rmap(page);
+       page->index = index;
+       spin_unlock(lock);
+}
+
+struct pte_chain *pte_chain_alloc(zone_t *zone);
+void pte_chain_push(zone_t *zone, struct pte_chain *pte_chain);
+void add_rmap_nolock(struct page* page, pte_t *ptep, struct pte_chain *pte_chain);
+
 /*
  * The zone field is never updated after free_area_init_core()
  * sets it, so none of the operations on it need to be atomic.
@@ -519,7 +532,7 @@
 extern int shmem_zero_setup(struct vm_area_struct *);

 extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size);
-extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
+extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma, unsigned *locknum);
 extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
 extern int zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot);

--- 2.4.19-pre7.clean/kernel/fork.c     Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/kernel/fork.c   Fri Aug  2 16:25:22 2002
@@ -132,6 +132,7 @@
 {
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;
+       unsigned rmap_locknum = -1;

        flush_cache_mm(current->mm);
        mm->locked_vm = 0;
@@ -191,7 +192,7 @@
                *pprev = tmp;
                pprev = &tmp->vm_next;
                mm->map_count++;
-               retval = copy_page_range(mm, current->mm, tmp);
+               retval = copy_page_range(mm, current->mm, tmp, &rmap_locknum);
                spin_unlock(&mm->page_table_lock);

                if (tmp->vm_ops && tmp->vm_ops->open)
--- 2.4.19-pre7.clean/mm/bootmem.c      Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/bootmem.c    Fri Aug  2 16:25:22 2002
@@ -61,6 +61,8 @@
         */
        memset(bdata->node_bootmem_map, 0xff, mapsize);

+       init_rmap_locks(); // is there a better place for this?
+
        return mapsize;
 }

--- 2.4.19-pre7.clean/mm/filemap.c      Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/filemap.c    Fri Aug  2 16:25:22 2002
@@ -635,7 +635,7 @@
        if (!PageLocked(page))
                BUG();

-       page->index = index;
+       set_page_index(page, index);
        page_cache_get(page);
        spin_lock(&pagecache_lock);
        add_page_to_inode_queue(mapping, page);
@@ -658,7 +658,7 @@
        flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked);
        page->flags = flags | (1 << PG_locked);
        page_cache_get(page);
-       page->index = offset;
+       set_page_index(page, offset);
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
 }
--- 2.4.19-pre7.clean/mm/memory.c       Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/memory.c     Fri Aug  2 17:48:29 2002
@@ -176,13 +176,17 @@
  * dst->page_table_lock is held on entry and exit,
  * but may be dropped within pmd_alloc() and pte_alloc().
  */
+struct pte_chain *pte_chain_alloc(zone_t *zone);
+
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
-                       struct vm_area_struct *vma)
+                       struct vm_area_struct *vma, unsigned *unused_locknum)
 {
        pgd_t * src_pgd, * dst_pgd;
        unsigned long address = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+       zone_t *pte_chain_zone = zone_table[ZONE_NORMAL];
+       struct pte_chain *local_pte_chain = NULL, *pte_chain;

        src_pgd = pgd_offset(src, address)-1;
        dst_pgd = pgd_offset(dst, address)-1;
@@ -212,6 +216,8 @@

                do {
                        pte_t * src_pte, * dst_pte;
+                       unsigned last_locknum = -1;
+                       spinlock_t *rmap_lock = NULL;

                        /* copy_pte_range */

@@ -247,6 +253,28 @@
                                        goto cont_copy_pte_range_noset;
                                }
                                ptepage = pte_page(pte);
+
+                               if (!local_pte_chain) {
+                                       unsigned more = 16;
+                                       if (last_locknum != -1) {
+                                               spin_unlock(rmap_lock);
+                                               last_locknum = -1;
+                                       }
+                                       while (more--) {
+                                               struct pte_chain *new = pte_chain_alloc(pte_chain_zone);
+                                               new->next = local_pte_chain;
+                                               local_pte_chain = new;
+                                       }
+                               }
+
+                               if (last_locknum != rmap_locknum(ptepage->index)) {
+                                       if (last_locknum != -1) {
+
+                                               spin_unlock(rmap_lock);
+                                       }
+                                       rmap_lock = lock_rmap(ptepage);
+                                       last_locknum = rmap_locknum(ptepage->index);
+                               }
                                if ((!VALID_PAGE(ptepage)) ||
                                    PageReserved(ptepage))
                                        goto cont_copy_pte_range;
@@ -265,15 +293,24 @@
                                dst->rss++;

 cont_copy_pte_range:           set_pte(dst_pte, pte);
-                               page_add_rmap(ptepage, dst_pte);
+                               pte_chain = local_pte_chain;
+                               local_pte_chain = local_pte_chain->next;
+                               add_rmap_nolock(ptepage, dst_pte, pte_chain);
+
 cont_copy_pte_range_noset:     address += PAGE_SIZE;
-                               if (address >= end)
+                               if (address >= end) {
+                                       if (last_locknum != -1)
+                                               spin_unlock(rmap_lock);
                                        goto out_unlock;
+                               }
                                src_pte++;
                                dst_pte++;
                        } while ((unsigned long)src_pte & PTE_TABLE_MASK);
                        spin_unlock(&src->page_table_lock);
-              
+
+                       if (last_locknum != -1)
+                               spin_unlock(rmap_lock);
+
 cont_copy_pmd_range:   src_pmd++;
                        dst_pmd++;
                } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
@@ -281,6 +318,13 @@
 out_unlock:
        spin_unlock(&src->page_table_lock);
 out:
+       spin_lock(&pte_chain_zone->pte_chain_freelist_lock);
+       while (local_pte_chain) {
+               struct pte_chain *next = local_pte_chain->next;
+               pte_chain_push(pte_chain_zone, local_pte_chain);
+               local_pte_chain = next;
+       }
+       spin_unlock(&pte_chain_zone->pte_chain_freelist_lock);
        return 0;
 nomem:
        return -ENOMEM;
@@ -1518,3 +1562,4 @@
        }
        return page;
 }
+
--- 2.4.19-pre7.clean/mm/page_alloc.c   Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/page_alloc.c Fri Aug  2 17:49:36 2002
@@ -213,6 +213,7 @@

                if (curr != head) {
                        unsigned int index;
+                       static unsigned foo_page_allocs;

                        page = memlist_entry(curr, struct page, list);
                        if (BAD_RANGE(zone,page))
@@ -227,6 +228,7 @@
                        spin_unlock_irqrestore(&zone->lock, flags);

                        set_page_count(page, 1);
+                       page->index = foo_page_allocs++ >> PAGE_CACHE_SHIFT;
                        if (BAD_RANGE(zone,page))
                                BUG();
                        DEBUG_LRU_PAGE(page);
--- 2.4.19-pre7.clean/mm/rmap.c Wed Jul 31 00:38:09 2002
+++ 2.4.19-pre7/mm/rmap.c       Fri Aug  2 17:33:51 2002
@@ -43,16 +43,20 @@
  * in systems with long-lived applications the relative overhead of
  * exit() will be lower since the applications are long-lived.
  */
-struct pte_chain {
-       struct pte_chain * next;
-       pte_t * ptep;
-};

-static inline struct pte_chain * pte_chain_alloc(zone_t *);
+spinlock_t rmap_locks[num_rmap_locks];
+
 static inline void pte_chain_free(struct pte_chain *, struct pte_chain *,
                struct page *, zone_t *);
 static void alloc_new_pte_chains(zone_t *);

+void init_rmap_locks(void)
+{
+       int i = 0;
+       while (i < num_rmap_locks)
+               spin_lock_init(rmap_locks + i++);
+}
+
 /**
  * page_referenced - test if the page was referenced
  * @page: the page to test
@@ -86,9 +90,10 @@
  * Add a new pte reverse mapping to a page.
  * The caller needs to hold the mm->page_table_lock.
  */
-void page_add_rmap(struct page * page, pte_t * ptep)
+void page_add_rmap(struct page *page, pte_t ...


Rmap speedup

Post by Andrew Morton » Sun, 04 Aug 2002 05:30:09



> This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> adopting a new locking interface that allows the add_rmaps to be batched in
> copy_page_range.

Well that's fairly straightforward, thanks.  Butt-ugly though ;)

Don't bother doing teardown yet.  I have patches which batch
all the zap_page_range activity into 16-page chunks, so we
eventually end up in a single function with 16 virtually-contiguous
pages to release.  Adding the batched locking to that will
be simple.

Sigh.  I have a test which sends the 2.5.30 VM into a five-minute
coma and which immediately panics latest -ac with pte_chain oom.
Remind me again why all this is worth it?

I'll port your stuff to 2.5 over the weekend, let you know...

Rmap speedup

Post by William Lee Irwin III » Sun, 04 Aug 2002 06:50:03



> Sigh.  I have a test which sends the 2.5.30 VM into a five-minute
> coma and which immediately panics latest -ac with pte_chain oom.
> Remind me again why all this is worth it?
> I'll port your stuff to 2.5 over the weekend, let you know...

I wrote the test (or is this the one I wrote?); I'll fix it. I've
already arranged to sleep in the right places.

Cheers,
Bill

Rmap speedup

Post by Rik van Riel » Sun, 04 Aug 2002 09:20:05




> > This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> > adopting a new locking interface that allows the add_rmaps to be batched in
> > copy_page_range.

> Well that's fairly straightforward, thanks.  Butt-ugly though ;)

It'd be nice if the code were a bit more beautiful and the
reverse mapping scheme more modular.

Remember that we're planning to go to an object-based scheme
later on; turning the code into a big monolithic mesh really
makes long-term maintenance a pain...

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/             http://distro.conectiva.com/


Rmap speedup

Post by Andrew Morton » Sun, 04 Aug 2002 09:40:05





> > > This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> > > adopting a new locking interface that allows the add_rmaps to be batched in
> > > copy_page_range.

> > Well that's fairly straightforward, thanks.  Butt-ugly though ;)

> It'd be nice if the code would be a bit more beautiful and the
> reverse mapping scheme more modular.

I changed it to, essentially:

foo()
{
        spinlock_t *rmap_lock = NULL;
        unsigned rmap_lockno = -1;
        ...
        for (stuff) {
                cached_rmap_lock(page, &rmap_lock, &rmap_lockno);
                __page_add_rmap(page, ptep);
                ..
        }
        drop_rmap_lock(&rmap_lock, &rmap_lockno);

}

See http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/daniel-rmap-spee...
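
The helpers could look something like this; just a sketch, reusing
rmap_locknum()/lock_rmap() from Daniel's 2.4 patch rather than the actual
code in the patch above:

/*
 * Keep the currently-held rmap lock cached across a loop and only switch
 * locks when the next page hashes to a different bucket.
 */
static inline void cached_rmap_lock(struct page *page,
                                    spinlock_t **lock, unsigned *lockno)
{
        unsigned locknum = rmap_locknum(page->index);

        if (*lockno != locknum) {
                if (*lock)
                        spin_unlock(*lock);
                *lock = lock_rmap(page);        /* revalidates page->index */
                *lockno = rmap_locknum(page->index);
        }
}

static inline void drop_rmap_lock(spinlock_t **lock, unsigned *lockno)
{
        if (*lock) {
                spin_unlock(*lock);
                *lock = NULL;
                *lockno = -1;
        }
}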

Fixing zap_pte_range pretty much requires the pagemap_lru_lock
rework; otherwise we couldn't hold the rmap lock across
tlb_remove_page().

> Remember that we're planning to go to an object-based scheme
> later on; turning the code into a big monolithic mesh really
> makes long-term maintenance a pain...

We have short-term rmap problems:

1) Unexplained pte chain state with ntpd
2) 10-20% increased CPU load in fork/exec/exit loads
3) system lock under heavy mmap load
4) ZONE_NORMAL pte_chain consumption

Daniel and I are on 2), Bill is on 4) (I think).

Rmap speedup

Post by William Lee Irwin III » Sun, 04 Aug 2002 10:00:09



>> Remember that we're planning to go to an object-based scheme
>> later on; turning the code into a big monolithic mesh really
>> makes long-term maintenance a pain...

> We have short-term rmap problems:
> 1) Unexplained pte chain state with ntpd
> 2) 10-20% increased CPU load in fork/exec/exit loads
> 3) system lock under heavy mmap load
> 4) ZONE_NORMAL pte_chain consumption

> Daniel and I are on 2), Bill is on 4) (I think).

I am indeed on (4), though I'd describe what I'm doing as "OOM handling".

Cheers,
Bill

Rmap speedup

Post by Rik van Riel » Sun, 04 Aug 2002 10:00:11



> > > Well that's fairly straightforward, thanks.  Butt-ugly though ;)
> I changed it to, essentially:
> See http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/daniel-rmap-spee...

This patch looks good.  Good enough for long-term maintainability,
even... ;)

I like it.

> We have short-term rmap problems:

> 1) Unexplained pte chain state with ntpd

I'll do a detailed trace of xntpd to see what's happening...

> 2) 10-20% increased CPU load in fork/exec/exit loads
> 3) system lock under heavy mmap load
> 4) ZONE_NORMAL pte_chain consumption

> Daniel and I are on 2), Bill is on 4) (I think).

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/             http://distro.conectiva.com/


Rmap speedup

Post by Daniel Phillips » Sun, 04 Aug 2002 12:50:05




> > This patch eliminates about 35% of the raw rmap setup/teardown overhead by
> > adopting a new locking interface that allows the add_rmaps to be batched
> > in copy_page_range.

> Well that's fairly straightforward, thanks.  Butt-ugly though ;)

I could try "fast is beautiful" or "beauty is in the eye of the beholder",
but I think I'll stick with "beauty isn't the point just now".

> Don't bother doing teardown yet.  I have patches which batch
> all the zap_page_range activity into 16-page chunks, so we
> eventually end up in a single function with 16 virtually-contiguous
> pages to release.  Adding the batched locking to that will
> be simple.

Great.  Well, both the locking locality of anonymous pages and the dispersion
of mmapped pages could be improved considerably, so maybe I'll play around with
those a little.  Taking a wild guess, it might be good for another 5-10%
overhead reduction, and won't impact the basic structure.

> Sigh.  I have a test which sends the 2.5.30 VM into a five-minute
> coma

That doesn't sound like a rmap problem per se.  Is the test posted?

> and which immediately panics latest -ac with pte_chain oom.
> Remind me again why all this is worth it?

It will be worth it when we finally have a system that swaps well and doesn't
die if you throw a lot of disk IO at it (like BSD).  It will be doubly worth
it when active defragmentation happens.

What we will end up with at the end of this cycle will have all the solidity
and flexibility of the BSD VM with little of the complexity.  According to me
anyway ;-)

--
Daniel

Rmap speedup

Post by Andrew Morton » Sun, 04 Aug 2002 14:20:05


No joy, I'm afraid.

2.5.26:

./daniel.sh  39.78s user 71.72s system 368% cpu 30.260 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh  38.45s user 70.00s system 365% cpu 29.642 total

c0132b0c 328      1.03288     free_page_and_swap_cache
c013074c 334      1.05177     lru_cache_add          
c0112a64 428      1.34778     do_page_fault          
c0144e90 434      1.36667     link_path_walk          
c01388b4 458      1.44225     do_page_cache_readahead
c01263e0 468      1.47374     clear_page_tables      
c01319b0 479      1.50838     __free_pages_ok        
c01e0700 514      1.61859     radix_tree_lookup      
c012a8a8 605      1.90515     find_get_page          
c01079f8 640      2.01537     page_fault              
c01127d4 649      2.04371     pte_alloc_one          
c0131ca0 811      2.55385     rmqueue                
c0127cc8 1152     3.62766     do_anonymous_page      
c013230c 1421     4.47474     page_cache_release      
c0126880 1544     4.86207     zap_pte_range          
c012662c 1775     5.58949     copy_page_range        
c0127e70 1789     5.63358     do_no_page              
c012750c 6860     21.6022     do_wp_page              

Stock 2.5.30:

./daniel.sh  36.60s user 88.23s system 366% cpu 34.029 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh  37.22s user 87.88s system 354% cpu 35.288 total

c014fdc4 191      0.872943    __d_lookup              
c01310c0 203      0.927788    kmem_cache_alloc_batch  
c0114154 227      1.03748     do_page_fault          
c0146ea8 243      1.1106      link_path_walk          
c0132fd0 257      1.17459     __free_pages_ok        
c0134284 279      1.27514     free_page_and_swap_cache
c0131538 309      1.41225     kmem_cache_free        
c0107c90 320      1.46252     page_fault              
c012ca48 326      1.48995     find_get_page          
c012a220 349      1.59506     handle_mm_fault        
c0128520 360      1.64534     clear_page_tables      
c0113ed0 367      1.67733     pte_alloc_one          
c013129c 399      1.82358     kmem_cache_alloc        
c01332bc 453      2.07038     rmqueue                
c0129df4 557      2.5457      do_anonymous_page      
c013392c 689      3.14899     page_cache_release      
c0128a60 832      3.80256     zap_pte_range          
c0129fa0 893      4.08135     do_no_page              
c0128828 1081     4.94059     copy_page_range        
c013aa74 1276     5.83181     page_add_rmap          
c013ab3c 3094     14.1408     page_remove_rmap        
c01296a8 3466     15.841      do_wp_page              

2.5.30+pagemap_lru_lock stuff

quad:/home/akpm> time ./daniel.sh
./daniel.sh  41.01s user 97.15s system 373% cpu 36.996 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh  36.67s user 87.04s system 368% cpu 33.575 total                    

c0131d60 230      1.08979     kmem_cache_alloc_batch  
c0148728 231      1.09453     link_path_walk          
c01321d8 238      1.12769     kmem_cache_free        
c01142b4 240      1.13717     do_page_fault          
c0135624 291      1.37882     free_page_and_swap_cache
c012a8cc 323      1.53044     handle_mm_fault        
c0128790 326      1.54466     clear_page_tables      
c0107c90 338      1.60152     page_fault              
c0131f3c 350      1.65837     kmem_cache_alloc        
c0113f20 373      1.76735     pte_alloc_one          
c012d2a8 397      1.88107     find_get_page          
c013466c 415      1.96636     rmqueue                
c0132f74 449      2.12746     __pagevec_release      
c012a3bc 532      2.52073     do_anonymous_page      
c012a5b0 772      3.6579      do_no_page              
c0128da0 854      4.04643     zap_pte_range          
c0128b48 1031     4.8851      copy_page_range        
c013c054 1244     5.89434     page_add_rmap          
c013c11c 3088     14.6316     page_remove_rmap        
c0129b58 3206     15.1907     do_wp_page              

2.5.30+pagemap_lru_lock+this patch:

quad:/home/akpm> time ./daniel.sh
./daniel.sh  38.78s user 91.56s system 366% cpu 35.534 total
quad:/home/akpm> time ./daniel.sh
./daniel.sh  38.07s user 88.64s system 363% cpu 34.883 total

c0135a90 332      1.30853     free_page_and_swap_cache
c013c57c 332      1.30853     page_add_rmap          
c012ad4d 337      1.32824     .text.lock.memory      
c0132448 353      1.3913      kmem_cache_free        
c0128790 372      1.46618     clear_page_tables      
c0107c90 377      1.48589     page_fault              
c01142b4 423      1.66719     do_page_fault          
c0113f20 432      1.70266     pte_alloc_one          
c012d518 438      1.72631     find_get_page          
c013c91c 438      1.72631     .text.lock.rmap        
c01321ac 443      1.74602     kmem_cache_alloc        
c012aafc 453      1.78543     handle_mm_fault        
c01349fc 463      1.82485     rmqueue                
c012a5ec 655      2.58159     do_anonymous_page      
c01331e4 748      2.94813     __pagevec_release      
c012a7e0 992      3.90982     do_no_page              
c0128e90 1426     5.62037     zap_pte_range          
c0128b48 1586     6.25099     copy_page_range        
c013c5c8 2324     9.1597      __page_remove_rmap      
c0129d88 4028     15.8758     do_wp_page              

- page_add_rmap has vanished
- page_remove_rmap has halved (80% of the remaining is the
  list walk)
- we've moved the cost into the new locking site, zap_pte_range
  and copy_page_range.

So rmap locking is still a 15% slowdown on my soggy quad, which generally
seems relatively immune to locking costs.  PPC will like the change
because spinlocks are better than bitops.   ia32 should have liked it
for the same reason but, as I say, this machine doesn't seem to have
the bandwidth*latency to be affected much by these things.

On more modern machines and other architectures this remains
a significant problem for rmap, I expect.

Guess we should instrument it up and make sure that the hashing
and index thing is getting the right locality.  I saw UML-for-2.5.30
whizz past, if you have time ;)
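
Per-bucket counters would probably tell us enough.  Hypothetical sketch,
nothing like this is in the patches:

/*
 * Count bucket hits and bucket switches in the copy/zap loops, to see
 * whether consecutive pages really land in the same rmap lock bucket.
 */
static atomic_t rmap_bucket_hits[num_rmap_locks];
static atomic_t rmap_bucket_switches;

static inline void count_rmap_bucket(struct page *page, unsigned *last)
{
        unsigned locknum = rmap_locknum(page->index);

        atomic_inc(&rmap_bucket_hits[locknum]);
        if (*last != locknum) {
                atomic_inc(&rmap_bucket_switches);
                *last = locknum;
        }
}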

Broken out patches are at
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/
Rolled-up patch is at
http://www.zip.com.au/~akpm/linux/patches/2.5/2.5.30/everything.gz

Rmap speedup

Post by Daniel Phillips » Mon, 05 Aug 2002 03:50:05



> - page_add_rmap has vanished
> - page_remove_rmap has halved (80% of the remaining is the
>   list walk)
> - we've moved the cost into the new locking site, zap_pte_range
>   and copy_page_range.
> So rmap locking is still a 15% slowdown on my soggy quad, which generally
> seems relatively immune to locking costs.

What is it about your quad?  I'm getting the expected results here on my
two-way.  I just checked that the lock hashing is doing what it's supposed to.
It is: if I drop all the locks into a single bucket, the speedup drops by
half.

It seems odd that you're seeing effectively no change at all.  Is it possible
we lost something in translation?  What happens if you just run with the
copy_page_range side, and no changes to zap_page_range?

> PPC will like the change
> because spinlocks are better than bitops.   ia32 should have liked it
> for the same reason but, as I say, this machine doesn't seem to have
> the bandwidth*latency to be affected much by these things.

> On more modern machines and other architectures this remains
> a significant problem for rmap, I expect.

My 2X 1GHz PIII definitely likes it.

> Guess we should instrument it up and make sure that the hashing
> and index thing is getting the right locality.  I saw UML-for-2.5.30
> whizz past, if you have time ;)

I've got instrumentation for that all ready to go.  I'll break it out and send
it along.  The bucket distribution can definitely be improved, by xoring some
higher bits of the lock number with a value specific to each mapping.  The
anon page locality is poor with the simple increment-a-counter approach; we
can do much better.
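
Something along these lines, say; a sketch of the xor idea only, and
deriving the mix value from the mapping pointer is just one possibility:

/*
 * Mix a per-mapping value into the bucket number so different files
 * don't all contend for the same few rmap locks.
 */
static inline unsigned long rmap_locknum_mixed(struct address_space *mapping,
                                               unsigned long index)
{
        unsigned long mix = mapping ? (unsigned long) mapping >> 8 : 0;

        return ((index >> 4) ^ mix) & (num_rmap_locks - 1);
}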

But before we start on the micro-optimization we need to know why your quad
is so unaffected by the big change.  Are you sure the slab cache batching of
pte chain allocation performs as well as my simpleminded inline batching?  
(I batched the pte chain allocation lock quite nicely.)  What about the bit
test/set for the direct rmap pointer: how is performance affected by dropping
the direct lookup optimization?  Note that you are holding the rmap lock
considerably longer than I was, by holding it across __page_add_rmap instead
of just across the few instructions where pointers are actually updated.  I'm
also wondering if gcc is optimizing your cached_rmap_lock inline as well as
you think it is.

I really need to be running on 2.5 so I can crosscheck your results.  I'll
return to the matter of getting the dac960 running now.

Miscellaneous question: we are apparently adding rmaps to reserved pages; why
is that?

--
Daniel

Rmap speedup

Post by Rik van Riel » Mon, 05 Aug 2002 06:10:07



> No joy, I'm afraid.
> Guess we should instrument it up and make sure that the hashing
> and index thing is getting the right locality.

Could it be that your quad needs to go to RAM to grab a cacheline
that's exclusive on the other CPU while Daniel's machine can just
shove cachelines from CPU to CPU?

What I'm referring to is the fact that the pte_chain_locks in
Daniel's patch are all packed into a few cachelines, instead of
having each lock on its own cache line...

This could explain the fact that the locking overhead plummeted
on Daniel's box while it didn't change at all on your machine.
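
If that's what's going on, the thing to compare against is one lock per
cache line, something like this sketch:

/*
 * One rmap lock per cache line: no false sharing between buckets, at
 * the cost of num_rmap_locks cache lines for the array.
 */
struct padded_rmap_lock {
        spinlock_t lock;
} ____cacheline_aligned;

static struct padded_rmap_lock padded_rmap_locks[num_rmap_locks];

Whether the larger cache footprint wins over the packed layout could be
exactly what differs between the two machines.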

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/             http://distro.conectiva.com/


Rmap speedup

Post by Daniel Phillips » Mon, 05 Aug 2002 06:30:06



> No joy, I'm afraid.

We need to eliminate some variables.  People, can we please have some SMP
results for 2-way or whatever-way for the exact kernel I used:

   http://www.kernel.org/pub/linux/kernel/v2.4/linux-2.4.18.tar.bz2
   http://www.kernel.org/pub/linux/kernel/v2.4/testing/old/patch-2.4.19-...
   http://surriel.com/patches/2.4/2.4.19p7-rmap-13b

With and without this patch:

   http://people.nl.linux.org/~phillips/patches/rmap.speedup-2.4.19-pre7

Using this script:

   http://people.nl.linux.org/~phillips/patches/lots_of_forks.sh

time sh lots_of_forks.sh

Thanks.

--
Daniel

Rmap speedup

Post by Andrew Morton » Mon, 05 Aug 2002 06:40:04




> > No joy, I'm afraid.

> > Guess we should instrument it up and make sure that the hashing
> > and index thing is getting the right locality.

> Could it be that your quad needs to go to RAM to grab a cacheline
> that's exclusive on the other CPU while Daniel's machine can just
> shove cachelines from CPU to CPU ?

Could be, Rik.  I don't know.  It's a bit worrisome that we might
be dependent on subtleties like that.

> What I'm referring to is the fact that the pte_chain_locks in
> Daniel's patch are all packed into a few cachelines, instead of
> having each lock on its own cache line...

> This could explain the fact that the locking overhead plummeted
> on Daniel's box while it didn't change at all on your machine.

Oh it helped a bit.   More in 2.4 than 2.5.  Possibly I broke
Daniel's patch somehow.    But even the improvement in 2.4
from Daniel's patch is disappointing.