NFS client/buffer cache deadlock

From: Brian Fundakowski Feldman (green_at_freebsd.org)
Date: 04/15/05

  • Next message: Andre Guibert de Bruet: "Re: LOR: vm page queue mutex -> vnode interlock"
    Date: Fri, 15 Apr 2005 01:08:21 -0400
    To: hackers@freebsd.org, current@freebsd.org
    
    

    I'll spare you a lengthy write-up because I think the patch documents it well
    enough. It certainly appears to fix things here when doing writes with very
    large block sizes, but it also reduces the throughput with those block
    sizes. (I don't think there should be any difference when using reasonable
    block sizes.)

    Would anyone care to take a shot at fixing it in a more elegant manner?

    Index: sys/buf.h
    ===================================================================
    RCS file: /export/ncvs/src/sys/sys/buf.h,v
    retrieving revision 1.167.2.1
    diff -u -r1.167.2.1 buf.h
    --- sys/buf.h 31 Jan 2005 23:26:55 -0000 1.167.2.1
    +++ sys/buf.h 15 Apr 2005 02:00:44 -0000
    @@ -469,6 +469,7 @@
     extern int maxswzone; /* Max KVA for swap structures */
     extern int maxbcache; /* Max KVA for buffer cache */
     extern int runningbufspace;
    +extern int hibufspace;
     extern int buf_maxio; /* nominal maximum I/O for buffer */
     extern struct buf *buf; /* The buffer headers. */
     extern char *buffers; /* The buffer contents. */
    Index: kern/vfs_bio.c
    ===================================================================
    RCS file: /export/ncvs/src/sys/kern/vfs_bio.c,v
    retrieving revision 1.444.2.2
    diff -u -r1.444.2.2 vfs_bio.c
    --- kern/vfs_bio.c 31 Jan 2005 23:26:18 -0000 1.444.2.2
    +++ kern/vfs_bio.c 15 Apr 2005 01:59:38 -0000
    @@ -113,7 +113,7 @@
     static int lobufspace;
     SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
         "Minimum amount of buffers we want to have");
    -static int hibufspace;
    +int hibufspace;
     SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
         "Maximum allowed value of bufspace (excluding buf_daemon)");
     static int bufreusecnt;
    Index: nfsclient/nfs_bio.c
    ===================================================================
    RCS file: /export/ncvs/src/sys/nfsclient/nfs_bio.c,v
    retrieving revision 1.133.2.2
    diff -u -r1.133.2.2 nfs_bio.c
    --- nfsclient/nfs_bio.c 31 Jan 2005 23:26:46 -0000 1.133.2.2
    +++ nfsclient/nfs_bio.c 15 Apr 2005 04:41:13 -0000
    @@ -726,6 +726,7 @@
             struct vattr vattr;
             struct nfsmount *nmp = VFSTONFS(vp->v_mount);
             daddr_t lbn;
    + off_t commitleft;
             int bcount;
             int n, on, error = 0;
             int haverslock = 0;
    @@ -755,6 +756,7 @@
              */
             if (ioflag & (IO_APPEND | IO_SYNC)) {
                     if (np->n_flag & NMODIFIED) {
    +flush_and_restart:
                             np->n_attrstamp = 0;
                             error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
                             if (error)
    @@ -832,12 +834,65 @@
             }
     
             biosize = vp->v_mount->mnt_stat.f_iosize;
    + commitleft = 0;
    + /*
    + * If there are possible modifications, then there may be some
    + * B_NEEDCOMMIT buffers. Total those up here and force a flush
    + * before starting to write if our writes can exceed the local
    + * maximum per-file write commit size.
    + *
    + * If there are no possible pending modifications, we still need
    + * to limit our write to that size.
    + */
    + if ((ioflag & (IO_SYNC | IO_INVAL)) != (IO_SYNC | IO_INVAL)) {
    + commitleft = nmp->nm_wcommitsize;
    + if (np->n_flag & NMODIFIED) {
    + int wouldcommit = 0;
    + VI_LOCK(vp);
    + TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
    + if (bp->b_flags & B_NEEDCOMMIT)
    + wouldcommit += bp->b_bcount;
    + }
    + VI_UNLOCK(vp);
    + /*
    + * Since we're not operating synchronously and
    + * bypassing the buffer cache, we are in a commit
    + * and holding all of these buffers whether
    + * transmitted or not. If not limited, this
    + * will lead to the buffer cache deadlocking,
    + * as no one else can flush our uncommitted buffers.
    + */
    + wouldcommit += uio->uio_resid;
    + /*
    + * If we would initially exceed the maximum
    + * outstanding write commit size, flush and restart.
    + */
    + if (wouldcommit > commitleft) {
    + if (haverslock) {
    + nfs_rsunlock(np, td);
    + haverslock = 0;
    + }
    + goto flush_and_restart;
    + }
    + } else {
    + /*
    + * With no outstanding commits, we are limited only
    + * by commitleft as to how far we can go.
    + */
    + }
    + }
     
             do {
                     nfsstats.biocache_writes++;
                     lbn = uio->uio_offset / biosize;
                     on = uio->uio_offset & (biosize-1);
                     n = min((unsigned)(biosize - on), uio->uio_resid);
    + /* Always allow at least one write. */
    + if (commitleft > 0) {
    + commitleft -= n;
    + if (commitleft == 0)
    + commitleft = -1;
    + }
     again:
                     /*
                      * Handle direct append and file extension cases, calculate
    @@ -932,12 +987,6 @@
                                     break;
                             }
                     }
    - if (!bp) {
    - error = nfs_sigintr(nmp, NULL, td);
    - if (!error)
    - error = EINTR;
    - break;
    - }
                     if (bp->b_wcred == NOCRED)
                             bp->b_wcred = crhold(cred);
                     np->n_flag |= NMODIFIED;
    @@ -1036,7 +1085,7 @@
                     } else {
                             bdwrite(bp);
                     }
    - } while (uio->uio_resid > 0 && n > 0);
    + } while (uio->uio_resid > 0 && n > 0 && commitleft >= 0);
     
             if (haverslock)
                     nfs_rsunlock(np, td);
    Index: nfsclient/nfs_vfsops.c
    ===================================================================
    RCS file: /export/ncvs/src/sys/nfsclient/nfs_vfsops.c,v
    retrieving revision 1.158.2.3
    diff -u -r1.158.2.3 nfs_vfsops.c
    --- nfsclient/nfs_vfsops.c 31 Jan 2005 23:26:46 -0000 1.158.2.3
    +++ nfsclient/nfs_vfsops.c 15 Apr 2005 02:03:05 -0000
    @@ -41,6 +41,8 @@
     #include <sys/param.h>
     #include <sys/systm.h>
     #include <sys/kernel.h>
    +#include <sys/bio.h>
    +#include <sys/buf.h>
     #include <sys/limits.h>
     #include <sys/lock.h>
     #include <sys/malloc.h>
    @@ -625,6 +627,12 @@
                     else
                             nmp->nm_readahead = NFS_MAXRAHEAD;
             }
    + if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) {
    + if (argp->wcommitsize < nmp->nm_wsize)
    + nmp->nm_wcommitsize = nmp->nm_wsize;
    + else
    + nmp->nm_wcommitsize = argp->wcommitsize;
    + }
             if ((argp->flags & NFSMNT_DEADTHRESH) && argp->deadthresh >= 0) {
                     if (argp->deadthresh <= NFS_MAXDEADTHRESH)
                             nmp->nm_deadthresh = argp->deadthresh;
    @@ -785,6 +793,7 @@
                     nmp->nm_wsize = NFS_WSIZE;
                     nmp->nm_rsize = NFS_RSIZE;
             }
    + nmp->nm_wcommitsize = hibufspace / (desiredvnodes / 1000);
             nmp->nm_readdirsize = NFS_READDIRSIZE;
             nmp->nm_numgrps = NFS_MAXGRPS;
             nmp->nm_readahead = NFS_DEFRAHEAD;
    Index: nfsclient/nfsargs.h
    ===================================================================
    RCS file: /export/ncvs/src/sys/nfsclient/nfsargs.h,v
    retrieving revision 1.66.2.1
    diff -u -r1.66.2.1 nfsargs.h
    --- nfsclient/nfsargs.h 31 Jan 2005 23:26:46 -0000 1.66.2.1
    +++ nfsclient/nfsargs.h 15 Apr 2005 01:33:08 -0000
    @@ -56,7 +56,7 @@
             int retrans; /* times to retry send */
             int maxgrouplist; /* Max. size of group list */
             int readahead; /* # of blocks to readahead */
    - int __pad1; /* was "leaseterm" */
    + int wcommitsize; /* Max. write commit size in bytes */
             int deadthresh; /* Retrans threshold */
             char *hostname; /* server's name */
             int acregmin; /* cache attrs for reg files min time */
    @@ -80,7 +80,7 @@
     #define NFSMNT_NFSV3 0x00000200 /* Use NFS Version 3 protocol */
     /* 0x400 free, was NFSMNT_KERB */
     #define NFSMNT_DUMBTIMR 0x00000800 /* Don't estimate rtt dynamically */
    -/* 0x1000 free, was NFSMNT_LEASETERM */
    +#define NFSMNT_WCOMMITSIZE 0x00001000 /* set max write commit size */
     #define NFSMNT_READAHEAD 0x00002000 /* set read ahead */
     #define NFSMNT_DEADTHRESH 0x00004000 /* set dead server retry thresh */
     #define NFSMNT_RESVPORT 0x00008000 /* Allocate a reserved port */
    Index: nfsclient/nfsmount.h
    ===================================================================
    RCS file: /export/ncvs/src/sys/nfsclient/nfsmount.h,v
    retrieving revision 1.27.2.1
    diff -u -r1.27.2.1 nfsmount.h
    --- nfsclient/nfsmount.h 31 Jan 2005 23:26:46 -0000 1.27.2.1
    +++ nfsclient/nfsmount.h 15 Apr 2005 01:21:57 -0000
    @@ -66,6 +66,7 @@
             int nm_wsize; /* Max size of write rpc */
             int nm_readdirsize; /* Size of a readdir rpc */
             int nm_readahead; /* Num. of blocks to readahead */
    + int nm_wcommitsize; /* Max size of commit for write */
             int nm_acdirmin; /* Directory attr cache min lifetime */
             int nm_acdirmax; /* Directory attr cache max lifetime */
             int nm_acregmin; /* Reg file attr cache min lifetime */

    -- 
    Brian Fundakowski Feldman                           \'[ FreeBSD ]''''''''''\
      <> green@FreeBSD.org                               \  The Power to Serve! \
     Opinions expressed are my own.                       \,,,,,,,,,,,,,,,,,,,,,,\
    _______________________________________________
    freebsd-current@freebsd.org mailing list
    http://lists.freebsd.org/mailman/listinfo/freebsd-current
    To unsubscribe, send any mail to "freebsd-current-unsubscribe@freebsd.org"
    

  • Next message: Andre Guibert de Bruet: "Re: LOR: vm page queue mutex -> vnode interlock"

    Relevant Pages