[RFC] vfs_bio additions, motivated by XFS for FreeBSD project

From: Craig Rodrigues (rodrigc_at_crodrigues.org)
Date: 11/12/05

  • Next message: Xin LI: "RFC: Why not move kernel MD code to sys/arch/?"
    Date: Sat, 12 Nov 2005 13:29:09 -0500
    To: freebsd-arch@freebsd.org
    
    

    Hi,

    Now that FreeBSD 6.0 is released, I would like to work
    on integrating code from the XFS for FreeBSD project into
    FreeBSD-CURRENT.

    Alexander Kabaev made some changes to vfs_bio.c which are
    needed by the XFS for FreeBSD code. In addition to some
    new functions, this patch adds a pin count (b_pin_count) and three new
    fields to struct buf (b_fsprivate1, b_fsprivate2, b_fsprivate3).
    You don't see their use here, but in the XFS for FreeBSD code
    (which you can get from http://people.freebsd.org/~rodrigc/xfs/ ),
    they are used to cache certain information.

    Comments?

    --- //depot/vendor/freebsd/src/sys/kern/vfs_bio.c 2005/10/08 15:01:11
    +++ //depot/projects/src/sys/kern/vfs_bio.c 2005/10/08 16:09:54
    @@ -216,7 +216,7 @@
      */
     static struct mtx rbreqlock;
     
    -/*
    +/*
      * Synchronization (sleep/wakeup) variable for buffer requests.
      * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
      * by and/or.
    @@ -233,8 +233,12 @@
     /*
      * Lock that protects against bwait()/bdone()/B_DONE races.
      */
    +static struct mtx bdonelock;
     
    -static struct mtx bdonelock;
    +/*
    + * Lock that protects b_pin_count for bpin()/bunpin()/bunpin_wait().
    + */
    +static struct mtx bpinlock;
     
     /*
      * Definitions for the buffer free lists.
    @@ -523,6 +527,7 @@
             mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
             mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
             mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
    + mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
     
             /* next, make a null set of free lists */
             for (i = 0; i < BUFFER_QUEUES; i++)
    @@ -636,7 +641,7 @@
      * bremfree:
      *
      * Mark the buffer for removal from the appropriate free list in brelse.
    - *
    + *
      */
     void
     bremfree(struct buf *bp)
    @@ -720,18 +725,51 @@
     }
     
     /*
    + * Attempt to initiate asynchronous I/O on read-ahead blocks. We must
    + * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
    + * the buffer is valid and we do not have to do anything.
    + */
    +void
    +breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
    + int cnt, struct ucred * cred)
    +{
    + struct buf *rabp;
    + int i;
    +
    + for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
    + if (inmem(vp, *rablkno))
    + continue;
    + rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
    +
    + if ((rabp->b_flags & B_CACHE) == 0) {
    + if (curthread != PCPU_GET(idlethread))
    + curthread->td_proc->p_stats->p_ru.ru_inblock++;
    + rabp->b_flags |= B_ASYNC;
    + rabp->b_flags &= ~B_INVAL;
    + rabp->b_ioflags &= ~BIO_ERROR;
    + rabp->b_iocmd = BIO_READ;
    + if (rabp->b_rcred == NOCRED && cred != NOCRED)
    + rabp->b_rcred = crhold(cred);
    + vfs_busy_pages(rabp, 0);
    + BUF_KERNPROC(rabp);
    + rabp->b_iooffset = dbtob(rabp->b_blkno);
    + bstrategy(rabp);
    + } else {
    + brelse(rabp);
    + }
    + }
    +}
    +
    +/*
      * Operates like bread, but also starts asynchronous I/O on
    - * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior
    - * to initiating I/O . If B_CACHE is set, the buffer is valid
    - * and we do not have to do anything.
    + * read-ahead blocks.
      */
     int
     breadn(struct vnode * vp, daddr_t blkno, int size,
         daddr_t * rablkno, int *rabsize,
         int cnt, struct ucred * cred, struct buf **bpp)
     {
    - struct buf *bp, *rabp;
    - int i;
    + struct buf *bp;
             int rv = 0, readwait = 0;
     
             CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
    @@ -752,29 +790,8 @@
                     ++readwait;
             }
     
    - for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
    - if (inmem(vp, *rablkno))
    - continue;
    - rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
    + breada(vp, rablkno, rabsize, cnt, cred);
     
    - if ((rabp->b_flags & B_CACHE) == 0) {
    - if (curthread != PCPU_GET(idlethread))
    - curthread->td_proc->p_stats->p_ru.ru_inblock++;
    - rabp->b_flags |= B_ASYNC;
    - rabp->b_flags &= ~B_INVAL;
    - rabp->b_ioflags &= ~BIO_ERROR;
    - rabp->b_iocmd = BIO_READ;
    - if (rabp->b_rcred == NOCRED && cred != NOCRED)
    - rabp->b_rcred = crhold(cred);
    - vfs_busy_pages(rabp, 0);
    - BUF_KERNPROC(rabp);
    - rabp->b_iooffset = dbtob(rabp->b_blkno);
    - bstrategy(rabp);
    - } else {
    - brelse(rabp);
    - }
    - }
    -
             if (readwait) {
                     rv = bufwait(bp);
             }
    @@ -807,6 +824,10 @@
     
             if (BUF_REFCNT(bp) == 0)
                     panic("bufwrite: buffer is not busy???");
    +
    + if (bp->b_pin_count > 0)
    + bunpin_wait(bp);
    +
             KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
                 ("FFS background buffer should not get here %p", bp));
     
    @@ -1117,6 +1138,11 @@
             KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
                 ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
     
    + if (bp->b_flags & B_MANAGED) {
    + bqrelse(bp);
    + return;
    + }
    +
             if (bp->b_iocmd == BIO_WRITE &&
                 (bp->b_ioflags & BIO_ERROR) &&
                 !(bp->b_flags & B_INVAL)) {
    @@ -1286,7 +1312,7 @@
                     }
     
             }
    -
    +
             if (BUF_REFCNT(bp) > 1) {
                     /* do not release to free list */
                     BUF_UNLOCK(bp);
    @@ -1394,6 +1420,18 @@
                     BUF_UNLOCK(bp);
                     return;
             }
    +
    + if (bp->b_flags & B_MANAGED) {
    + if (bp->b_flags & B_REMFREE) {
    + mtx_lock(&bqlock);
    + bremfreel(bp);
    + mtx_unlock(&bqlock);
    + }
    + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
    + BUF_UNLOCK(bp);
    + return;
    + }
    +
             mtx_lock(&bqlock);
             /* Handle delayed bremfree() processing. */
             if (bp->b_flags & B_REMFREE)
    @@ -1821,6 +1859,10 @@
                     bp->b_npages = 0;
                     bp->b_dirtyoff = bp->b_dirtyend = 0;
                     bp->b_bufobj = NULL;
    + bp->b_pin_count = 0;
    + bp->b_fsprivate1 = NULL;
    + bp->b_fsprivate2 = NULL;
    + bp->b_fsprivate3 = NULL;
     
                     LIST_INIT(&bp->b_dep);
     
    @@ -2059,6 +2101,10 @@
     
                     if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
                             continue;
    + if (bp->b_pin_count > 0) {
    + BUF_UNLOCK(bp);
    + continue;
    + }
                     BO_LOCK(bp->b_bufobj);
                     if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
                         (bp->b_flags & B_DELWRI) == 0) {
    @@ -2393,6 +2439,19 @@
                             if ((bp->b_flags & B_VMIO) == 0 ||
                                 (size > bp->b_kvasize)) {
                                     if (bp->b_flags & B_DELWRI) {
    + /*
    + * If the buffer is pinned and the caller
    + * does not want to sleep waiting for it
    + * to be unpinned, bail out.
    + */
    + if (bp->b_pin_count > 0) {
    + if (flags & GB_LOCK_NOWAIT) {
    + bqrelse(bp);
    + return (NULL);
    + } else {
    + bunpin_wait(bp);
    + }
    + }
                                             bp->b_flags |= B_NOCACHE;
                                             bwrite(bp);
                                     } else {
    @@ -3034,11 +3093,11 @@
             struct bufobj *dropobj;
             void (*biodone)(struct buf *);
     
    -
             CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
             dropobj = NULL;
     
    - KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
    + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
    + BUF_REFCNT(bp)));
             KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
     
             runningbufwakeup(bp);
    @@ -3053,6 +3112,19 @@
                             bufobj_wdrop(dropobj);
                     return;
             }
    +
    + bufdone_finish(bp);
    +
    + if (dropobj)
    + bufobj_wdrop(dropobj);
    +}
    +
    +void
    +bufdone_finish(struct buf *bp)
    +{
    + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
    + BUF_REFCNT(bp)));
    +
             if (LIST_FIRST(&bp->b_dep) != NULL)
                     buf_complete(bp);
     
    @@ -3118,7 +3190,8 @@
                                     if (m == NULL)
                                             panic("biodone: page disappeared!");
                                     bp->b_pages[i] = m;
    - pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
    + pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
    + bp->b_pages, bp->b_npages);
                             }
     #if defined(VFS_BIO_DEBUG)
                             if (OFF_TO_IDX(foff) != m->pindex) {
    @@ -3130,7 +3203,7 @@
     
                             /*
                              * In the write case, the valid and clean bits are
    - * already changed correctly ( see bdwrite() ), so we
    + * already changed correctly ( see bdwrite() ), so we
                              * only need to do this here in the read case.
                              */
                             if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
    @@ -3185,8 +3258,6 @@
                             bqrelse(bp);
             } else
                     bdone(bp);
    - if (dropobj)
    - bufobj_wdrop(dropobj);
     }
     
     /*
    @@ -3742,6 +3813,32 @@
             return (error);
     }
     
    +void
    +bpin(struct buf *bp)
    +{
    + mtx_lock(&bpinlock);
    + bp->b_pin_count ++;
    + mtx_unlock(&bpinlock);
    +}
    +
    +void
    +bunpin(struct buf *bp)
    +{
    + mtx_lock(&bpinlock);
    + if ( --bp->b_pin_count == 0)
    + wakeup(bp);
    + mtx_unlock(&bpinlock);
    +}
    +
    +void
    +bunpin_wait(struct buf *bp)
    +{
    + mtx_lock(&bpinlock);
    + while (bp->b_pin_count > 0)
    + msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
    + mtx_unlock(&bpinlock);
    +}
    +
     #include "opt_ddb.h"
     #ifdef DDB
     #include <ddb/ddb.h>
    @@ -3794,3 +3891,4 @@
             }
     }
     #endif /* DDB */
    +
    --- //depot/vendor/freebsd/src/sys/kern/vfs_cluster.c 2005/08/14 09:53:08
    +++ //depot/projects/src/sys/kern/vfs_cluster.c 2005/08/14 10:01:58
    @@ -765,6 +765,12 @@
                             --len;
                             continue;
                     }
    + if (tbp->b_pin_count > 0) {
    + BUF_UNLOCK(tbp);
    + ++start_lbn;
    + --len;
    + continue;
    + }
                     bremfree(tbp);
                     tbp->b_flags &= ~B_DONE;
     
    @@ -868,6 +874,15 @@
                                             BUF_UNLOCK(tbp);
                                             break;
                                     }
    +
    + /*
    + * Do not pull in pinned buffers.
    + */
    + if (tbp->b_pin_count > 0) {
    + BUF_UNLOCK(tbp);
    + break;
    + }
    +
                                     /*
                                      * Ok, it's passed all the tests,
                                      * so remove it from the free list
    @@ -979,3 +994,4 @@
             buflist->bs_nchildren = i + 1;
             return (buflist);
     }
    +
    --- //depot/vendor/freebsd/src/sys/sys/buf.h 2005/10/08 15:01:11
    +++ //depot/projects/src/sys/sys/buf.h 2005/10/08 16:09:54
    @@ -135,6 +135,10 @@
             struct vm_page *b_pages[btoc(MAXPHYS)];
             int b_npages;
             struct workhead b_dep; /* (D) List of filesystem dependencies. */
    + void *b_fsprivate1;
    + void *b_fsprivate2;
    + void *b_fsprivate3;
    + int b_pin_count;
     };
     
     #define b_object b_bufobj->bo_object
    @@ -214,7 +218,7 @@
     #define B_01000000 0x01000000 /* Available flag. */
     #define B_02000000 0x02000000 /* Available flag. */
     #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */
    -#define B_08000000 0x08000000 /* Available flag. */
    +#define B_MANAGED 0x08000000 /* Managed by FS. */
     #define B_RAM 0x10000000 /* Read ahead mark (flag) */
     #define B_VMIO 0x20000000 /* VMIO flag */
     #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */
    @@ -486,6 +490,7 @@
     void bremfree(struct buf *);
     void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */
     int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **);
    +void breada(struct vnode *, daddr_t *, int *, int, struct ucred *);
     int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int,
                 struct ucred *, struct buf **);
     void bdwrite(struct buf *);
    @@ -504,6 +509,7 @@
     int bufwait(struct buf *);
     int bufwrite(struct buf *);
     void bufdone(struct buf *);
    +void bufdone_finish(struct buf *);
     
     int cluster_read(struct vnode *, u_quad_t, daddr_t, long,
                 struct ucred *, long, int, struct buf **);
    @@ -527,7 +533,11 @@
     struct buf *trypbuf(int *);
     void bwait(struct buf *, u_char, const char *);
     void bdone(struct buf *);
    +void bpin(struct buf *);
    +void bunpin(struct buf *);
    +void bunpin_wait(struct buf *);
     
     #endif /* _KERNEL */
     
     #endif /* !_SYS_BUF_H_ */
    +

    -- 
    Craig Rodrigues        
    rodrigc@crodrigues.org
    _______________________________________________
    freebsd-arch@freebsd.org mailing list
    http://lists.freebsd.org/mailman/listinfo/freebsd-arch
    To unsubscribe, send any mail to "freebsd-arch-unsubscribe@freebsd.org"
    

  • Next message: Xin LI: "RFC: Why not move kernel MD code to sys/arch/?"