From: Erez Zadok <ezk@cs.sunysb.edu>
Date: Tue, 26 Jul 2005 03:31:08 +0000 (+0000)
Subject: * amd/amd.h (FSF_FORCE_UNMOUNT): new flag used to indicate that a
X-Git-Tag: am-utils-6_1_1~11
X-Git-Url: https://git.fsl.cs.sunysb.edu/?a=commitdiff_plain;h=8d5b03fe3c4a7cea709d5442dcc6c871bccad0aa;p=am-utils-6.1.git

* amd/amd.h (FSF_FORCE_UNMOUNT): new flag used to indicate that a
particular fserver may need forced/lazy unmounts when its mntfs's
are unmounted.

* amd/ops_nfs.c (nfs_umount): if a simple unmount returned EBUSY,
the user specified forced_unmounts=yes in amd.conf, this fserver
was flagged with FSF_FORCE_UNMOUNT, and the OS supports forced/lazy
unmounts, then try a forced/lazy unmount.  This should allow a hung
mount point to be removed.

* amd/srvr_nfs.c (find_nfs_srvr): move the IP-address change
detection code out into its own function.
(check_fs_addr_change): new function to detect if the IP address
of a downed host has changed, and do various cleanups and fixups
to recover as best we can from that situation (e.g., flushing
various caches).  Also set the FSF_FORCE_UNMOUNT flag for the
fserver in question.
(flush_srvr_nfs_cache): pass fserver as argument, so we can
selectively flush the NFS cache for a single fserver (or all of
them, if you pass NULL).
---

diff --git a/ChangeLog b/ChangeLog
index 9b13be9..3d5c681 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,26 @@
 2005-07-25  Erez Zadok  <ezk@cs.sunysb.edu>
 
+	* amd/amd.h (FSF_FORCE_UNMOUNT): new flag used to indicate that a
+	particular fserver may need forced/lazy unmounts when its mntfs's
+	are unmounted.
+
+	* amd/ops_nfs.c (nfs_umount): if a simple unmount returned EBUSY,
+	the user specified forced_unmounts=yes in amd.conf, this fserver
+	was flagged with FSF_FORCE_UNMOUNT, and the OS supports forced/lazy
+	unmounts, then try a forced/lazy unmount.  This should allow a hung
+	mount point to be removed.
+
+	* amd/srvr_nfs.c (find_nfs_srvr): move the IP-address change
+	detection code out into its own function.
+	(check_fs_addr_change): new function to detect if the IP address
+	of a downed host has changed, and do various cleanups and fixups
+	to recover as best we can from that situation (e.g., flushing
+	various caches).  Also set the FSF_FORCE_UNMOUNT flag for the
+	fserver in question.
+	(flush_srvr_nfs_cache): pass fserver as argument, so we can
+	selectively flush the NFS cache for a single fserver (or all of
+	them, if you pass NULL).
+
 	* libamu/xutil.c (switch_to_logfile): truncate a regular-file log
 	file if user passed non-zero "truncate_log" flag.
 
diff --git a/amd/amd.h b/amd/amd.h
index 1316d40..19fa916 100644
--- a/amd/amd.h
+++ b/amd/amd.h
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: amd.h,v 1.68 2005/07/26 01:48:13 ezk Exp $
+ * $Id: amd.h,v 1.69 2005/07/26 03:31:08 ezk Exp $
  *
  */
 
@@ -140,6 +140,7 @@
 #define	FSF_PINGING	0x0010	/* Already doing pings */
 #define	FSF_WEBNFS	0x0020	/* Don't try to contact portmapper */
 #define FSF_PING_UNINIT	0x0040	/* ping values have not been initilized */
+#define FSF_FORCE_UNMOUNT 0x0080 /* force umount of this fserver */
 #define	FSRV_ERROR(fs)	((fs) && (((fs)->fs_flags & FSF_ERROR) == FSF_ERROR))
 #define	FSRV_ISDOWN(fs)	((fs) && (((fs)->fs_flags & (FSF_DOWN|FSF_VALID)) == (FSF_DOWN|FSF_VALID)))
 #define	FSRV_ISUP(fs)	(!(fs) || (((fs)->fs_flags & (FSF_DOWN|FSF_VALID)) == (FSF_VALID)))
@@ -534,7 +535,7 @@ extern void amfs_mkcacheref(mntfs *mf);
 extern int amfs_mount(am_node *mp, mntfs *mf, char *opts);
 extern void assign_error_mntfs(am_node *mp);
 extern am_node *next_nonerror_node(am_node *xp);
-extern void flush_srvr_nfs_cache(void);
+extern void flush_srvr_nfs_cache(fserver *fs);
 extern void am_mounted(am_node *);
 extern void mf_mounted(mntfs *mf, bool_t call_free_opts);
 extern void am_unmounted(am_node *);
diff --git a/amd/amq_subr.c b/amd/amq_subr.c
index 5855b0b..ef15416 100644
--- a/amd/amq_subr.c
+++ b/amd/amq_subr.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: amq_subr.c,v 1.20 2005/07/26 01:48:13 ezk Exp $
+ * $Id: amq_subr.c,v 1.21 2005/07/26 03:31:08 ezk Exp $
  *
  */
 /*
@@ -154,8 +154,8 @@ amqproc_setopt_1_svc(voidp argp, struct svc_req *rqstp)
     if (amd_state == Run) {
       plog(XLOG_INFO, "amq says flush cache");
       do_mapc_reload = 0;
-      flush_nfs_fhandle_cache((fserver *) 0);
-      flush_srvr_nfs_cache();
+      flush_nfs_fhandle_cache((fserver *) NULL);
+      flush_srvr_nfs_cache((fserver *) NULL);
     }
     break;
   }
diff --git a/amd/ops_nfs.c b/amd/ops_nfs.c
index a22972a..e81e3af 100644
--- a/amd/ops_nfs.c
+++ b/amd/ops_nfs.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: ops_nfs.c,v 1.44 2005/07/20 03:32:30 ezk Exp $
+ * $Id: ops_nfs.c,v 1.45 2005/07/26 03:31:08 ezk Exp $
  *
  */
 
@@ -304,7 +304,7 @@ flush_nfs_fhandle_cache(fserver *fs)
   fh_cache *fp;
 
   ITER(fp, fh_cache, &fh_head) {
-    if (fp->fh_fs == fs || fs == 0) {
+    if (fp->fh_fs == fs || fs == NULL) {
       /*
        * Only invalidate port info for non-WebNFS servers
        */
@@ -926,8 +926,24 @@ static int
 nfs_umount(am_node *am, mntfs *mf)
 {
   int unmount_flags = (mf->mf_flags & MFF_ON_AUTOFS) ? AMU_UMOUNT_AUTOFS : 0;
+  int new_unmount_flags;
   int error = UMOUNT_FS(mf->mf_mount, mnttab_file_name, unmount_flags);
 
+#if defined(HAVE_UMOUNT2) && (defined(MNT2_GEN_OPT_FORCE) || defined(MNT2_GEN_OPT_DETACH))
+  /*
+   * If the attempt to unmount failed with EBUSY, and this fserver was
+   * marked for forced unmounts, then use forced/lazy unmounts.
+   */
+  if (error == EBUSY &&
+      gopt.flags & CFM_FORCED_UNMOUNTS &&
+      mf->mf_server->fs_flags & FSF_FORCE_UNMOUNT) {
+    plog(XLOG_INFO, "EZK: nfs_umount: trying forced/lazy unmounts");
+    mf->mf_server->fs_flags &= ~FSF_FORCE_UNMOUNT; /* XXX: incorrect */
+    new_unmount_flags = unmount_flags | AMU_UMOUNT_FORCE | AMU_UMOUNT_DETACH;
+    error = UMOUNT_FS(mf->mf_mount, mnttab_file_name, new_unmount_flags);
+  }
+#endif /* HAVE_UMOUNT2 && (MNT2_GEN_OPT_FORCE || MNT2_GEN_OPT_DETACH) */
+
   /*
    * Here is some code to unmount 'restarted' file systems.
    * The restarted file systems are marked as 'nfs', not
@@ -955,7 +971,7 @@ nfs_umount(am_node *am, mntfs *mf)
 
       if (NSTREQ(mf->mf_mount, new_mf->mf_mount, len) &&
 	  new_mf->mf_mount[len] == '/') {
-	int new_unmount_flags =
+	new_unmount_flags =
 	  (new_mf->mf_flags & MFF_ON_AUTOFS) ? AMU_UMOUNT_AUTOFS : 0;
 	UMOUNT_FS(new_mf->mf_mount, mnttab_file_name, new_unmount_flags);
 	didsome = 1;
diff --git a/amd/srvr_nfs.c b/amd/srvr_nfs.c
index 2e8dadf..9b5f95c 100644
--- a/amd/srvr_nfs.c
+++ b/amd/srvr_nfs.c
@@ -37,7 +37,7 @@
  * SUCH DAMAGE.
  *
  *
- * $Id: srvr_nfs.c,v 1.41 2005/07/11 01:55:28 ezk Exp $
+ * $Id: srvr_nfs.c,v 1.42 2005/07/26 03:31:08 ezk Exp $
  *
  */
 
@@ -114,18 +114,20 @@ static void nfs_keepalive(voidp);
 
 
 /*
- * Flush any cached data
+ * Flush cached data for an fserver (or for all, if fs==NULL)
  */
 void
-flush_srvr_nfs_cache(void)
+flush_srvr_nfs_cache(fserver *fs)
 {
-  fserver *fs = 0;
+  fserver *fs2 = NULL;
 
-  ITER(fs, fserver, &nfs_srvr_list) {
-    nfs_private *np = (nfs_private *) fs->fs_private;
-    if (np) {
-      np->np_mountd_inval = TRUE;
-      np->np_error = -1;
+  ITER(fs2, fserver, &nfs_srvr_list) {
+    if (fs == NULL || fs == fs2) {
+      nfs_private *np = (nfs_private *) fs2->fs_private;
+      if (np) {
+	np->np_mountd_inval = TRUE;
+	np->np_error = -1;
+      }
     }
   }
 }
@@ -436,6 +438,47 @@ nfs_keepalive_callback(voidp pkt, int len, struct sockaddr_in *sp, struct sockad
 }
 
 
+static void
+check_fs_addr_change(fserver *fs)
+{
+  struct hostent *hp = NULL;
+  struct in_addr ia;
+  char *old_ipaddr, *new_ipaddr;
+  //  nfs_private *np = (nfs_private *) fs->fs_private;
+  EZKDBG;
+
+  hp = gethostbyname(fs->fs_host);
+  if (!hp ||
+      hp->h_addrtype != AF_INET ||
+      !STREQ((char *) hp->h_name, fs->fs_host) ||
+      memcmp((voidp) &fs->fs_ip->sin_addr,
+	     (voidp) hp->h_addr,
+	     sizeof(fs->fs_ip->sin_addr)) == 0)
+    return;
+  /* if got here: downed server changed IP address */
+  old_ipaddr = strdup(inet_ntoa(fs->fs_ip->sin_addr));
+  memmove((voidp) &ia, (voidp) hp->h_addr, sizeof(struct in_addr));
+  new_ipaddr = inet_ntoa(ia);	/* ntoa uses static buf */
+  plog(XLOG_WARNING, "EZK: down fileserver %s changed ip: %s -> %s",
+       fs->fs_host, old_ipaddr, new_ipaddr);
+  XFREE(old_ipaddr);
+  /* copy new IP addr */
+  memmove((voidp) &fs->fs_ip->sin_addr,
+	  (voidp) hp->h_addr,
+	  sizeof(fs->fs_ip->sin_addr));
+  /* XXX: are any of these correct?! */
+  fs->fs_flags &= ~FSF_DOWN;
+  fs->fs_flags |= FSF_VALID | FSF_WANT;
+  map_flush_srvr(fs);		/* XXX: a race with flush_srvr_nfs_cache? */
+  flush_srvr_nfs_cache(fs);
+  fs->fs_flags |= FSF_FORCE_UNMOUNT;
+
+#if 0
+  flush_nfs_fhandle_cache(fs);	/* done in caller: nfs_keepalive_timeout */
+  // XXX: need to purge nfs_private so that somehow it will get re-initialized
+#endif
+}
+
 /*
  * Called when no ping-reply received
  */
@@ -477,6 +520,7 @@ nfs_keepalive_timeout(voidp v)
        */
       flush_nfs_fhandle_cache(fs);
       np->np_error = -1;
+      check_fs_addr_change(fs);	/* check if IP addr of fserver changed */
     } else {
       /*
        * Known to be down
@@ -744,35 +788,11 @@ find_nfs_srvr(mntfs *mf)
   /*
    * This may not be the best way to do things, but it really doesn't make
    * sense to query a file server which is marked as 'down' for any
-   * version/proto combination: so just return that 'downed' server if it
-   * matched.  We also check here if by any chance, the IP address of the
-   * server was changed; this happens when NFS servers are migrated, or a
-   * temporary server is made available for one that failed.
+   * version/proto combination.
    */
   ITER(fs, fserver, &nfs_srvr_list) {
-    if (!FSRV_ISDOWN(fs) || !STREQ(host, fs->fs_host))
-      continue;
-    if (memcmp((voidp) &fs->fs_ip->sin_addr,
-	       (voidp) &ip->sin_addr,
-	       sizeof(ip->sin_addr)) != 0) {
-      /* IP address of downed server has changed! */
-      char *old_ipaddr = strdup(inet_ntoa(fs->fs_ip->sin_addr));
-      char *new_ipaddr = inet_ntoa(ip->sin_addr); /* ntoa uses static buf */
-      plog(XLOG_WARNING, "down fileserver %s changed ip: %s -> %s",
-	   host, old_ipaddr, new_ipaddr);
-      XFREE(old_ipaddr);
-      /* Now fix the fserver to the new IP */
-      dlog("resetting fileserver %s to ip %s (flags: valid, not down)",
-	   host, new_ipaddr);
-      memmove((voidp) &fs->fs_ip->sin_addr,
-	      (voidp) &ip->sin_addr,
-	      sizeof(ip->sin_addr));
-      fs->fs_flags |= FSF_VALID;
-      fs->fs_flags &= ~(FSF_DOWN|FSF_ERROR);
-      flush_nfs_fhandle_cache(fs); /* XXX: safer, but really needed? */
-      /* fall through to checking available NFS protocols, pinging, etc. */
-    } else {
-      /* server was down and is still down.  Not much we can do. */
+    if (FSRV_ISDOWN(fs) &&
+	STREQ(host, fs->fs_host)) {
       plog(XLOG_WARNING, "fileserver %s is already hung - not running NFS proto/version discovery", host);
       fs->fs_refc++;
       if (ip)