pam_radius_auth: simultaneous requests for improved failover behaviour

Stefan Winter stefan.winter at restena.lu
Mon Apr 28 11:40:37 CEST 2008


-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Hello list,

we have been using pam_radius_auth for numerous services. They are in a
failover scenario against two FreeRADIUS servers with a sane timeout to
trigger querying the second host.
Standard failover works sufficiently well for most services. There are
some, however, that die horribly when the primary server isn't
responding. One such example is Horde IMP: it needs six consecutive
authentications for login: the first for access verification on the web
frontend, and another five IMAP logins to retrieve quota information,
mail index, folder list, whatnot.
Even small timeouts get too large when having them * 6. It gets worse
because Horde opens its database connection in the very beginning and
blocks its web server thread until all six authentications have been
done, and it thereby effectively blocks lots of SQL threads for a long time.
In short: using failover based on timeout wrecks web server and
database, even though conceptually, everything should be fine from the
RADIUS side.

A colleague of mine, Bruno Prémont, has created a patch to
pam_radius_auth which allows to configure parallelity for user
authentication requests. Basically, it can be used to fire auth requests
to all servers in the list simultaneously and takes the first answer it
gets. This is effectively a 0-delay failover. It can also be used for a
more cautious behaviour of waiting non-zero, but less time than timeout
for a server's answer.

The patch is on the tracker under #550
(http://bugs.freeradius.org/show_bug.cgi?id=550)
and inline at the end of this mail.

Compile tests:
~  x86_32
~  x86_64
Runtime tested:
~  x86_32
~  x86_64

I know that this is kindof hacky. It introduces double load on the
backend servers. It does appear to solve our problem though. :-)

I'm also aware that it will skew statistics, because from a server's
view, twice as many people are authenticated. For us it's a minor thing,
but if there's interest, this could be mitigated by introducing a unique
attribute into all the requests that belong to a single attempt. That
shouldn't be too hard to do.

Greetings,

Stefan Winter

========= part 1 (fix compiler warnings) ========

diff -NurpP pam_radius-1.3.17.orig/pam_radius_auth.c
pam_radius-1.3.17/pam_radius_auth.c
- --- pam_radius-1.3.17.orig/pam_radius_auth.c	2007-03-26
11:36:13.000000000 +0200
+++ pam_radius-1.3.17/pam_radius_auth.c	2008-04-24 14:57:34.928405141 +0200
@@ -766,7 +766,8 @@ static int
~ talk_radius(radius_conf_t *conf, AUTH_HDR *request, AUTH_HDR *response,
~             char *password, char *old_password, int tries)
~ {
- -  int salen, total_length;
+  socklen_t salen;
+  int total_length;
~   fd_set set;
~   struct timeval tv;
~   time_t now, end;
@@ -1099,7 +1100,7 @@ pam_sm_authenticate(pam_handle_t *pamh,i
~     DPRINT(LOG_DEBUG, "Got PAM_RUSER name %s", userinfo);

~     if (!strcmp("root", user)) {
- -      user = userinfo;
+      user = *userinfo;
~       DPRINT(LOG_DEBUG, "Username now %s from ruser", user);
~     } else {
~       DPRINT(LOG_DEBUG, "Skipping ruser for non-root auth");

========= part 2 (introduce pdelay) ========

diff -NurpP pam_radius-1.3.17.orig/pam_radius_auth.c
pam_radius-1.3.17/pam_radius_auth.c
- --- pam_radius-1.3.17.orig/pam_radius_auth.c	2007-03-26
11:36:13.000000000 +0200
+++ pam_radius-1.3.17/pam_radius_auth.c	2008-04-24 14:57:34.928405141 +0200
@@ -591,6 +591,7 @@ initialize(radius_conf_t *conf, int acco
~   radius_server_t *server = NULL;
~   struct sockaddr_in * s_in;
~   int timeout;
+  int pdelay;
~   int line = 0;

~   /* the first time around, read the configuration file */
@@ -621,7 +622,8 @@ initialize(radius_conf_t *conf, int acco
~     }

~     timeout = 3;
- -    if (sscanf(p, "%s %s %d", hostname, secret, &timeout) < 2) {
+    pdelay = -1;
+    if (sscanf(p, "%s %s %d %d", hostname, secret, &timeout, &pdelay) <
2) {
~       _pam_log(LOG_ERR, "ERROR reading %s, line %d: Could not read
hostname or secret\n",
~ 	       conf_file, line);
~       continue; /* invalid line */
@@ -648,6 +650,7 @@ initialize(radius_conf_t *conf, int acco
~       } else {
~ 	server->timeout = timeout;
~       }
+      server->pdelay = pdelay < -1 ? -1 : (pdelay > 600 ? 600 : pdelay);
~       server->next = NULL;
~     }
~   }
@@ -666,6 +669,13 @@ initialize(radius_conf_t *conf, int acco
~     return PAM_AUTHINFO_UNAVAIL;
~   }

+  pdelay = fcntl(conf->sockfd, F_GETFL);
+  if (pdelay == -1 || fcntl(conf->sockfd, F_SETFL, pdelay | O_NONBLOCK)
== -1) {
+    _pam_log(LOG_ERR, "Failed to flag RADIUS socket O_NONBLOCK: %s\n",
+	     strerror(errno));
+    return PAM_AUTHINFO_UNAVAIL;
+  }
+
~   /* set up the local end of the socket communications */
~   s_in = (struct sockaddr_in *) &salocal;
~   memset ((char *) s_in, '\0', sizeof(struct sockaddr));
@@ -770,15 +780,24 @@ static int
~   int total_length;
~   fd_set set;
~   struct timeval tv;
- -  time_t now, end;
+  time_t now;
~   int rcode;
~   struct sockaddr saremote;
~   struct sockaddr_in *s_in = (struct sockaddr_in *) &saremote;
- -  radius_server_t *server = conf->server;
- -  int ok;
- -  int server_tries;
+  radius_server_t *server_prev, *server = conf->server;
+  int server_left = 0;
~   int retval;

+  /*
+    Cleanup our server structs
+    */
+  for (server = conf->server; server; server = server->next) {
+    server->attempts = tries;
+    server->sendtime = 0;
+    server_left++;
+    server->reqid = request->vector[server_left % AUTH_VECTOR_LEN];
+  }
+
~   /* ************************************************************ */
~   /* Now that we're done building the request, we can send it */

@@ -791,214 +810,235 @@ talk_radius(radius_conf_t *conf, AUTH_HD
~     sequence, on the off chance that one may have ended up fixing itself.

~     */
- -
- -  /* loop over all available servers */
- -  while (server != NULL) {

- -    /* only look up IP information as necessary */
- -    if ((retval = host2server(server)) != PAM_SUCCESS) {
- -      _pam_log(LOG_ERR,
- -	       "Failed looking up IP address for RADIUS server %s (errcode=%d)",
- -	       server->hostname, retval);
- -      ok = FALSE;
- -      goto next;		/* skip to the next server */
- -    }
- -
- -    /* set up per-server IP && port configuration */
- -    memset ((char *) s_in, '\0', sizeof(struct sockaddr));
- -    s_in->sin_family = AF_INET;
- -    s_in->sin_addr.s_addr = htonl(server->ip.s_addr);
- -    s_in->sin_port = server->port;
- -    total_length = ntohs(request->length);
- -
- -    if (!password) { 		/* make an RFC 2139 p6 request authenticator */
- -      get_accounting_vector(request, server);
- -    }
- -
- -    server_tries = tries;
- -send:
- -    /* send the packet */
- -    if (sendto(conf->sockfd, (char *) request, total_length, 0,
- -	       &saremote, sizeof(struct sockaddr_in)) < 0) {
- -      _pam_log(LOG_ERR, "Error sending RADIUS packet to server %s: %s",
- -	       server->hostname, strerror(errno));
- -      ok = FALSE;
- -      goto next;		/* skip to the next server */
- -    }
- -
- -    /* ************************************************************ */
- -    /* Wait for the response, and verify it. */
- -    salen = sizeof(struct sockaddr);
- -    tv.tv_sec = server->timeout; /* wait for the specified time */
+  /* loop over all available servers */
+  while (server_left > 0) {
+    tv.tv_sec = 60;
~     tv.tv_usec = 0;
- -    FD_ZERO(&set);		/* clear out the set */
- -    FD_SET(conf->sockfd, &set);	/* wait only for the RADIUS UDP socket */
- -
- -    time(&now);
- -    end = now + tv.tv_sec;
+    server = conf->server;
+    server_prev = NULL;

- -    /* loop, waiting for the select to return data */
- -    ok = TRUE;
- -    while (ok) {
- -
- -      rcode = select(conf->sockfd + 1, &set, NULL, NULL, &tv);
- -
- -      /* select timed out */
- -      if (rcode == 0) {
- -	_pam_log(LOG_ERR, "RADIUS server %s failed to respond",
- -		 server->hostname);
- -	if (--server_tries)
- -	  goto send;
- -	ok = FALSE;
- -	break;			/* exit from the select loop */
- -      } else if (rcode < 0) {
- -
- -	/* select had an error */
- -	if (errno == EINTR) {	/* we were interrupted */
+    while (server) {
+      time(&now);
+
+      /* Attempt this server if we don't have to wait */
+      if (server_prev && server_prev->attempts >= 0 &&
server_prev->sendtime + server->pdelay > now) {
+	// reduce tv.tv_sec if needed
+	if (tv.tv_sec > server_prev->sendtime + server->pdelay - now)
+	  tv.tv_sec = server_prev->sendtime + server->pdelay - now;
+	goto receive;
+      } else if (server->attempts >= 0 && server->sendtime +
server->timeout > now) {
+	if (tv.tv_sec > server->timeout)
+	  tv.tv_sec = server->timeout;
+	if (tv.tv_sec > server->timeout + server->sendtime - now)
+	  tv.tv_sec = server->timeout + server->sendtime - now;
+	goto skip;
+      } else if (server->attempts >= 0 && server->timeout < tv.tv_sec)
+	tv.tv_sec = server->timeout;
+
+      if (server->sendtime == 0) {
+	if ((retval = host2server(server)) != PAM_SUCCESS) {
+	  _pam_log(LOG_ERR,
+		 "Failed looking up IP address for RADIUS server %s (errcode=%d)",
+		 server->hostname, retval);
+	  server->attempts = -1;
+	  server_left--;
+	  goto skip;
+	} else {
+	  server->sendtime = now;
~ 	  time(&now);
- -	
- -	  if (now > end) {
- -	    _pam_log(LOG_ERR, "RADIUS server %s failed to respond",
- -		     server->hostname);
- -	    if (--server_tries)
- -	      goto send;
- -	    ok = FALSE;
- -	    break;			/* exit from the select loop */
- -	  }
- -	
- -	  tv.tv_sec = end - now;
- -	  if (tv.tv_sec == 0) {	/* keep waiting */
- -	    tv.tv_sec = 1;
- -	  }
+	  get_random_vector(server->vector);
+	}
+      }
+
+      if (server->attempts > 0) {
+	/* set up per-server IP && port configuration */
+	memset ((char *) s_in, '\0', sizeof(struct sockaddr));
+	s_in->sin_family = AF_INET;
+	s_in->sin_addr.s_addr = htonl(server->ip.s_addr);
+	s_in->sin_port = server->port;
+	total_length = ntohs(request->length);
+	/* fill in our packet */
+	memcpy(request->vector, server->vector, AUTH_VECTOR_LEN);
+	request->id = server->reqid;

- -	} else {		/* not an interrupt, it was a real error */
- -	  _pam_log(LOG_ERR, "Error waiting for response from RADIUS server %s:
%s",
+	if (!password) { 		/* make an RFC 2139 p6 request authenticator */
+	  get_accounting_vector(request, server);
+	} else {
+	  if (old_password) {	/* password change request */
+	    add_password(request, PW_PASSWORD, password, old_password);
+	    add_password(request, PW_OLD_PASSWORD, old_password, old_password);
+	  } else {		/* authentication request */
+	    add_password(request, PW_PASSWORD, password, server->secret);
+	  }
+	}
+
+	if (sendto(conf->sockfd, (char *) request, total_length, 0,
+		   &saremote, sizeof(struct sockaddr_in)) < 0) {
+	  _pam_log(LOG_ERR, "Error sending RADIUS packet to server %s: %s",
~ 		   server->hostname, strerror(errno));
- -	  ok = FALSE;
- -	  break;
+	  server->attempts = -1;
+	  server_left--;
+	  goto skip;
+	} else
+	  server->attempts--;
+      } else {
+	if (server->sendtime + server->timeout < now && server->attempts == 0) {
+	  _pam_log(LOG_ERR, "RADIUS server %s failed to respond",
+		   server->hostname);
+	  server->attempts = -1;
+	  server_left--;
~ 	}
+      }
+
+skip:
+      server_prev = server;
+      server = server->next;
+    }
+    goto receive;

- -	/* the select returned OK */
- -      } else if (FD_ISSET(conf->sockfd, &set)) {
- -
- -	/* try to receive some data */
- -	if ((total_length = recvfrom(conf->sockfd, (char *) response,
- -				     BUFFER_SIZE,
- -				     0, &saremote, &salen)) < 0) {
- -	  _pam_log(LOG_ERR, "error reading RADIUS packet from server %s: %s",
- -		   server->hostname, strerror(errno));
- -	  ok = FALSE;
+select:
+    if (server_left <= 0)
+      break;
+    /* select() on our socket */
+    FD_ZERO(&set);              /* clear out the set */
+    FD_SET(conf->sockfd, &set); /* wait only for the RADIUS UDP socket */
+    rcode = select(conf->sockfd + 1, &set, NULL, NULL, &tv);
+    if (rcode < 0) {
+      /* some error while calling select() */
+      if (errno == EINTR) {
+	/* we were interrupted */
+	continue;
+      } else {
+	/* not an interrupt, it was a real error */
+	_pam_log(LOG_ERR, "Error waiting for response from RADIUS server %s: %s",
+		 server->hostname, strerror(errno));
+	continue;
+      }
+    } else if (rcode > 0 && FD_ISSET(conf->sockfd, &set)) {
+      /* try to receive some data */
+receive:
+      salen = sizeof(struct sockaddr);
+      if ((total_length = recvfrom(conf->sockfd, (char *) response,
+				   BUFFER_SIZE,
+				   0, &saremote, &salen)) < 0) {
+	if (errno == EAGAIN)
+	  goto select;
+	_pam_log(LOG_ERR, "error reading RADIUS packet from server %s: %s",
+		 server->hostname, strerror(errno));
+	continue;
+      } else {
+	/* there's data, see if it's valid */
+	char *p;
+	
+	// check origin of response
+	for (server = conf->server; server; server = server->next) {
+	  if (server->sendtime == 0)
+	    continue;
+	  if (s_in->sin_family != AF_INET)
+	    continue;
+	  if (s_in->sin_port != server->port)
+	    continue;
+	  if (s_in->sin_addr.s_addr != htonl(server->ip.s_addr))
+	    continue;
~ 	  break;
+	}
+	if (server == NULL) {
+	  _pam_log(LOG_ERR, "RADIUS answer from martian source %s:%d",
inet_ntoa(s_in->sin_addr),
+	           s_in->sin_port);
+	  continue;
+	} else
+	  p = server->secret;
+	
+	if (total_length < 2 || (ntohs(response->length) != total_length) ||
+	    (ntohs(response->length) > BUFFER_SIZE)) {
+	  _pam_log(LOG_ERR, "RADIUS packet from server %s is corrupted",
+		   server->hostname);
+	  continue;
+	}
+
+	/* restore request to the state of resposne server */
+	memcpy(request->vector, server->vector, AUTH_VECTOR_LEN);
+	request->id = server->reqid;

- -	  /* there's data, see if it's valid */
+	if (!password) { 		/* make an RFC 2139 p6 request authenticator */
+	  get_accounting_vector(request, server);
~ 	} else {
- -	  char *p = server->secret;
- -	
- -	  if ((ntohs(response->length) != total_length) ||
- -	      (ntohs(response->length) > BUFFER_SIZE)) {
- -	    _pam_log(LOG_ERR, "RADIUS packet from server %s is corrupted",
- -		     server->hostname);
- -	    ok = FALSE;
- -	    break;
+	  if (old_password) {	/* password change request */
+	    add_password(request, PW_PASSWORD, password, old_password);
+	    add_password(request, PW_OLD_PASSWORD, old_password, old_password);
+	  } else {		/* authentication request */
+	    add_password(request, PW_PASSWORD, password, server->secret);
~ 	  }
+	}

- -	  /* Check if we have the data OK.  We should also check request->id */
- -
- -	  if (password) {
- -	    if (old_password) {
+	/* Check if we have the data OK.  We should also check request->id */
+	if (password) {
+	  if (old_password) {
~ #ifdef LIVINGSTON_PASSWORD_VERIFY_BUG_FIXED
- -	      p = old_password;	/* what it should be */
+	    p = old_password;	/* what it should be */
~ #else
- -	      p = "";		/* what it really is */
+	    p = "";		/* what it really is */
~ #endif
- -	    }
- -	    /*
- -	     * RFC 2139 p.6 says not do do this, but the Livingston 1.16
- -	     * server disagrees.  If the user says he wants the bug, give in.
- -	     */
- -	  } else {		/* authentication request */
- -	    if (conf->accounting_bug) {
- -	      p = "";
- -	    }
~ 	  }
- -	
- -	  if (!verify_packet(p, response, request)) {
- -	    _pam_log(LOG_ERR, "packet from RADIUS server %s fails
verification: The shared secret is probably incorrect.",
- -		     server->hostname);
- -	    ok = FALSE;
- -	    break;
- -	  }
- -
- -	  /*
- -	   * Check that the response ID matches the request ID.
+	  /*
+	   * RFC 2139 p.6 says not do do this, but the Livingston 1.16
+	   * server disagrees.  If the user says he wants the bug, give in.
~ 	   */
- -	  if (response->id != request->id) {
- -	    _pam_log(LOG_WARNING, "Response packet ID %d does not match the
request packet ID %d: verification of packet fails", response->id,
request->id);
- -	      ok = FALSE;
- -	    break;
+	} else {		/* authentication request */
+	  if (conf->accounting_bug) {
+	    p = "";
~ 	  }
~ 	}

+	if (!verify_packet(p, response, request)) {
+	  _pam_log(LOG_ERR, "packet from RADIUS server %s fails verification:
The shared secret is probably incorrect.",
+		   server->hostname);
+	  continue;
+	}
+	
+	/*
+	 * Check that the response ID matches the request ID.
+	 */
+	if (response->id != request->id) {
+	  _pam_log(LOG_WARNING, "Response packet ID %d does not match the
request packet ID %d: verification of packet fails", response->id,
request->id);
+	  continue;
+	}
+	
~ 	/*
~ 	 * Whew!  The select is done.  It hasn't timed out, or errored out.
~ 	 * It's our descriptor.  We've got some data.  It's the right size.
~ 	 * The packet is valid.
~ 	 * NOW, we can skip out of the select loop, and process the packet
~ 	 */
- -	break;
+	goto success;
~       }
- -      /* otherwise, we've got data on another descriptor, keep
select'ing */
~     }
+  }

- -    /* go to the next server if this one didn't respond */
- -  next:
- -    if (!ok) {
- -      radius_server_t *old;	/* forget about this server */
- -
- -      old = server;
- -      server = server->next;
- -      conf->server = server;
+  _pam_log(LOG_ERR, "All RADIUS servers failed to respond.");
+  if (conf->localifdown)
+    retval = PAM_IGNORE;
+  else
+    retval = PAM_AUTHINFO_UNAVAIL;
+  goto out;

- -      _pam_forget(old->secret);
- -      free(old->hostname);
- -      free(old);
- -
- -      if (server) {		/* if there's more servers to check */
- -	/* get a new authentication vector, and update the passwords */
- -	get_random_vector(request->vector);
- -	request->id = request->vector[0];
- -	
- -	/* update passwords, as appropriate */
- -	if (password) {
- -	  get_random_vector(request->vector);
- -	  if (old_password) {	/* password change request */
- -	    add_password(request, PW_PASSWORD, password, old_password);
- -	    add_password(request, PW_OLD_PASSWORD, old_password, old_password);
- -	  } else {		/* authentication request */
- -	    add_password(request, PW_PASSWORD, password, server->secret);
- -	  }
- -	}
+success:
+  retval = PAM_SUCCESS;
+  live_server = server;
+  if (conf->server != live_server) {
+    for (server = conf->server; server; server = server->next)
+      if (server->next == live_server) {
+	server->next = live_server->next;
+	break;
~       }
- -      continue;
- -
- -    } else {
- -      /* we've found one that does respond, forget about the other
servers */
- -      cleanup(server->next);
- -      server->next = NULL;
- -      live_server = server;	/* we've got a live one! */
- -      break;
- -    }
- -  }
- -
- -  if (!server) {
- -    _pam_log(LOG_ERR, "All RADIUS servers failed to respond.");
- -    if (conf->localifdown)
- -      retval = PAM_IGNORE;
- -    else
- -      retval = PAM_AUTHINFO_UNAVAIL;
- -  } else {
- -    retval = PAM_SUCCESS;
- -  }
+  } else
+    conf->server = live_server->next;
+  live_server->next = NULL;
+
+out:
+  cleanup(conf->server);
+  conf->server = NULL;

~   return retval;
~ }
diff -NurpP pam_radius-1.3.17.orig/pam_radius_auth.conf
pam_radius-1.3.17/pam_radius_auth.conf
- --- pam_radius-1.3.17.orig/pam_radius_auth.conf	2000-07-10
21:18:42.000000000 +0200
+++ pam_radius-1.3.17/pam_radius_auth.conf	2008-04-23 16:06:36.178404122
+0200
@@ -8,11 +8,13 @@
~ #  lines.  Blank lines or lines beginning with '#' are treated as
~ #  comments, and are ignored.  The fields are:
~ #
- -#  server[:port] secret [timeout]
+#  server[:port] secret [timeout] [parallel delay]
~ #
~ #  the port name or number is optional.  The default port name is
- -#  "radius", and is looked up from /etc/services The timeout field is
- -#  optional.  The default timeout is 3 seconds.
+#  "radius", and is looked up from /etc/services. The timeout field is
+#  optional.  The default timeout is 3 seconds. The paralell delay field
+#  is optional. The default paralell delay is -1 which means no,
+#  paralellism, maximum is timeout is 1 minute (60)
~ #
~ #  If multiple RADIUS server lines exist, they are tried in order.  The
~ #  first server to return success or failure causes the module to return
@@ -22,9 +24,9 @@
~ #  The timeout field controls how many seconds the module waits before
~ #  deciding that the server has failed to respond.
~ #
- -# server[:port]	shared_secret      timeout (s)
+# server[:port]	shared_secret      timeout (s)  pdelay (s)
~ 127.0.0.1	secret             1
- -other-server    other-secret       3
+other-server    other-secret       3            1

~ #
~ # having localhost in your radius configuration is a Good Thing.
diff -NurpP pam_radius-1.3.17.orig/pam_radius_auth.h
pam_radius-1.3.17/pam_radius_auth.h
- --- pam_radius-1.3.17.orig/pam_radius_auth.h	2007-03-26
07:35:31.000000000 +0200
+++ pam_radius-1.3.17/pam_radius_auth.h	2008-04-24 11:27:33.009599859 +0200
@@ -16,6 +16,7 @@
~ #include <utmp.h>
~ #include <time.h>
~ #include <netinet/in.h>
+#include <arpa/inet.h>
~ #include <netdb.h>
~ #include <fcntl.h>

@@ -41,7 +42,12 @@ typedef struct radius_server_t {
~   char *hostname;
~   char *secret;
~   int timeout;
+  int pdelay;
~   int accounting;
+  time_t sendtime;
+  int attempts;
+  u_char reqid;
+  u_char vector[AUTH_VECTOR_LEN];
~ } radius_server_t;

~ typedef struct radius_conf_t {
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.4-svn0 (GNU/Linux)
Comment: Using GnuPG with SUSE - http://enigmail.mozdev.org

iD8DBQFIFZuV+jm90f8eFWYRApBHAJ4nauU3k6sg47qTHNmRhjwJdT5TNwCfakKc
4IUxngbm1X7/M3TmPNwpG+I=
=6mdH
-----END PGP SIGNATURE-----



More information about the Freeradius-Devel mailing list