Re: [PATCH] multipath: fix scsi async tur checker corruption

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 12/21/2011 12:01 AM, Benjamin Marzinski wrote:
> Since the tur checker runs asynchronously in its own thread, there is nothing
> that keeps a path from being orphaned or deleted before the tur thread has
> finished. When this happenes the checker struct gets deleted.  However, the tur
> thread might still we writing to that memory.  This can lead to memory
> corruption.  This patch adds all of the necessary data to the checker context,
> and makes the tur thread only use that. This way, if the checker is deleted
> while the thread is still using the context, the thread will clean up the
> context itself.
> 
> Signed-off-by: Benjamin Marzinski <bmarzins@xxxxxxxxxx>
> ---
>  libmultipath/checkers/tur.c |   85 +++++++++++++++++++++++++++++++++-----------
>  libmultipath/discovery.c    |    1 
>  2 files changed, 65 insertions(+), 21 deletions(-)
> 
> Index: multipath-tools-111219/libmultipath/checkers/tur.c
> ===================================================================
> --- multipath-tools-111219.orig/libmultipath/checkers/tur.c
> +++ multipath-tools-111219/libmultipath/checkers/tur.c
> @@ -35,10 +35,15 @@ struct tur_checker_context {
>  	dev_t devt;
>  	int state;
>  	int running;
> -	time_t timeout;
> +	int fd;
> +	unsigned int timeout;
> +	time_t time;
>  	pthread_t thread;
>  	pthread_mutex_t lock;
>  	pthread_cond_t active;
> +	pthread_spinlock_t hldr_lock;
> +	int holders;
> +	char message[CHECKER_MSG_LEN];
>  };
>  
>  #define TUR_DEVT(c) major((c)->devt), minor((c)->devt)
> @@ -53,28 +58,49 @@ int libcheck_init (struct checker * c)
>  	memset(ct, 0, sizeof(struct tur_checker_context));
>  
>  	ct->state = PATH_UNCHECKED;
> +	ct->fd = -1;
> +	ct->holders = 1;
>  	pthread_cond_init(&ct->active, NULL);
>  	pthread_mutex_init(&ct->lock, NULL);
> +	pthread_spin_init(&ct->hldr_lock, PTHREAD_PROCESS_PRIVATE);
>  	c->context = ct;
>  
>  	return 0;
>  }
>  
> +void cleanup_context(struct tur_checker_context *ct)
> +{
> +	pthread_mutex_destroy(&ct->lock);
> +	pthread_cond_destroy(&ct->active);
> +	pthread_spin_destroy(&ct->hldr_lock);
> +	free(ct);
> +}
> +
>  void libcheck_free (struct checker * c)
>  {
>  	if (c->context) {
>  		struct tur_checker_context *ct = c->context;
> +		int holders;
> +		pthread_t thread;
>  
> -		pthread_mutex_destroy(&ct->lock);
> -		pthread_cond_destroy(&ct->active);
> -		free(c->context);
> +		pthread_spin_lock(&ct->hldr_lock);
> +		ct->holders--;
> +		holders = ct->holders;
> +		thread = ct->thread;
> +		pthread_spin_unlock(&ct->hldr_lock);
> +		if (holders)
> +			pthread_cancel(thread);
> +		else
> +			cleanup_context(ct);
>  		c->context = NULL;
>  	}
>  	return;
>  }
>  
> +#define TUR_MSG(msg, fmt, args...) snprintf(msg, CHECKER_MSG_LEN, fmt, ##args);
> +
>  int
> -tur_check (struct checker * c)
> +tur_check(int fd, unsigned int timeout, char *msg)
>  {
>  	struct sg_io_hdr io_hdr;
>  	unsigned char turCmdBlk[TUR_CMD_LEN] = { 0x00, 0, 0, 0, 0, 0 };
> @@ -90,10 +116,10 @@ tur_check (struct checker * c)
>  	io_hdr.dxfer_direction = SG_DXFER_NONE;
>  	io_hdr.cmdp = turCmdBlk;
>  	io_hdr.sbp = sense_buffer;
> -	io_hdr.timeout = c->timeout;
> +	io_hdr.timeout = timeout;
>  	io_hdr.pack_id = 0;
> -	if (ioctl(c->fd, SG_IO, &io_hdr) < 0) {
> -		MSG(c, MSG_TUR_DOWN);
> +	if (ioctl(fd, SG_IO, &io_hdr) < 0) {
> +		TUR_MSG(msg, MSG_TUR_DOWN);
>  		return PATH_DOWN;
>  	}
>  	if ((io_hdr.status & 0x7e) == 0x18) {
> @@ -101,7 +127,7 @@ tur_check (struct checker * c)
>  		 * SCSI-3 arrays might return
>  		 * reservation conflict on TUR
>  		 */
> -		MSG(c, MSG_TUR_UP);
> +		TUR_MSG(msg, MSG_TUR_UP);
>  		return PATH_UP;
>  	}
>  	if (io_hdr.info & SG_INFO_OK_MASK) {
> @@ -146,14 +172,14 @@ tur_check (struct checker * c)
>  				 * LOGICAL UNIT NOT ACCESSIBLE,
>  				 * TARGET PORT IN STANDBY STATE
>  				 */
> -				MSG(c, MSG_TUR_GHOST);
> +				TUR_MSG(msg, MSG_TUR_GHOST);
>  				return PATH_GHOST;
>  			}
>  		}
> -		MSG(c, MSG_TUR_DOWN);
> +		TUR_MSG(msg, MSG_TUR_DOWN);
>  		return PATH_DOWN;
>  	}
> -	MSG(c, MSG_TUR_UP);
> +	TUR_MSG(msg, MSG_TUR_UP);
>  	return PATH_UP;
>  }
>  
> @@ -162,18 +188,25 @@ tur_check (struct checker * c)
>  
>  void cleanup_func(void *data)
>  {
> +	int holders;
>  	struct tur_checker_context *ct = data;
> +	pthread_spin_lock(&ct->hldr_lock);
> +	ct->holders--;
> +	holders = ct->holders;
>  	ct->thread = 0;
> +	pthread_spin_unlock(&ct->hldr_lock);
> +	if (!holders)
> +		cleanup_context(ct);
>  }
>  
>  void *tur_thread(void *ctx)
>  {
> -	struct checker *c = ctx;
> -	struct tur_checker_context *ct = c->context;
> +	struct tur_checker_context *ct = ctx;
>  	int state;
>  
>  	condlog(3, "%d:%d: tur checker starting up", TUR_DEVT(ct));
>  
> +	ct->message[0] = '\0';
>  	/* This thread can be canceled, so setup clean up */
>  	tur_thread_cleanup_push(ct)
>  
> @@ -182,7 +215,7 @@ void *tur_thread(void *ctx)
>  	ct->state = PATH_PENDING;
>  	pthread_mutex_unlock(&ct->lock);
>  
> -	state = tur_check(c);
> +	state = tur_check(ct->fd, ct->timeout, ct->message);
>  
>  	/* TUR checker done */
>  	pthread_mutex_lock(&ct->lock);
> @@ -213,7 +246,7 @@ void tur_set_async_timeout(struct checke
>  	struct timeval now;
>  
>  	gettimeofday(&now, NULL);
> -	ct->timeout = now.tv_sec + c->timeout;
> +	ct->time = now.tv_sec + c->timeout;
>  }
>  
>  int tur_check_async_timeout(struct checker *c)
> @@ -222,7 +255,7 @@ int tur_check_async_timeout(struct check
>  	struct timeval now;
>  
>  	gettimeofday(&now, NULL);
> -	return (now.tv_sec > ct->timeout);
> +	return (now.tv_sec > ct->time);
>  }
>  
>  extern int
> @@ -242,7 +275,7 @@ libcheck_check (struct checker * c)
>  		ct->devt = sb.st_rdev;
>  
>  	if (c->sync)
> -		return tur_check(c);
> +		return tur_check(c->fd, c->timeout, c->message);
>  
>  	/*
>  	 * Async mode
> @@ -276,6 +309,8 @@ libcheck_check (struct checker * c)
>  			/* TUR checker done */
>  			ct->running = 0;
>  			tur_status = ct->state;
> +			strncpy(c->message, ct->message, CHECKER_MSG_LEN);
> +			c->message[CHECKER_MSG_LEN - 1] = '\0';
>  		}
>  		pthread_mutex_unlock(&ct->lock);
>  	} else {
> @@ -284,24 +319,32 @@ libcheck_check (struct checker * c)
>  			pthread_mutex_unlock(&ct->lock);
>  			condlog(3, "%d:%d: tur thread not responding, "
>  				"using sync mode", TUR_DEVT(ct));
> -			return tur_check(c);
> +			return tur_check(c->fd, c->timeout, c->message);
>  		}
>  		/* Start new TUR checker */
>  		ct->state = PATH_UNCHECKED;
> +		ct->fd = c->fd;
> +		ct->timeout = c->timeout;
> +		pthread_spin_lock(&ct->hldr_lock);
> +		ct->holders++;
> +		pthread_spin_unlock(&ct->hldr_lock);
>  		tur_set_async_timeout(c);
>  		setup_thread_attr(&attr, 32 * 1024, 1);
> -		r = pthread_create(&ct->thread, &attr, tur_thread, c);
> +		r = pthread_create(&ct->thread, &attr, tur_thread, ct);
>  		if (r) {
>  			pthread_mutex_unlock(&ct->lock);
>  			ct->thread = 0;
> +			ct->holders--;
>  			condlog(3, "%d:%d: failed to start tur thread, using"
>  				" sync mode", TUR_DEVT(ct));
> -			return tur_check(c);
> +			return tur_check(c->fd, c->timeout, c->message);
>  		}
>  		pthread_attr_destroy(&attr);
>  		tur_timeout(&tsp);
>  		r = pthread_cond_timedwait(&ct->active, &ct->lock, &tsp);
>  		tur_status = ct->state;
> +		strncpy(c->message, ct->message,CHECKER_MSG_LEN);
> +		c->message[CHECKER_MSG_LEN -1] = '\0';
>  		pthread_mutex_unlock(&ct->lock);
>  		if (ct->thread &&
>  		    (tur_status == PATH_PENDING || tur_status == PATH_UNCHECKED)) {
> Index: multipath-tools-111219/libmultipath/discovery.c
> ===================================================================
> --- multipath-tools-111219.orig/libmultipath/discovery.c
> +++ multipath-tools-111219/libmultipath/discovery.c
> @@ -826,6 +826,7 @@ get_state (struct path * pp, int daemon)
>  		}
>  		checker_set_fd(c, pp->fd);
>  		if (checker_init(c, pp->mpp?&pp->mpp->mpcontext:NULL)) {
> +			memset(c, 0x0, sizeof(struct checker));
>  			condlog(3, "%s: checker init failed", pp->dev);
>  			return PATH_UNCHECKED;
>  		}
> 
Hmm. Why do we need ->holders here?
In theory we can have only two states; async thread running or no
thread running. I really cannot imagine a scenario where ->holders
would be > 1.
Or, to stress the point a bit more, any scenario where holders > 1
would most definitely be a bug, as this would indicate that an async
tur thread is running, but we somehow failed to notice this, and
started a new one. Which means we end with two threads doing exactly
the same thing. And which was _exactly_ what we want to avoid with
at startup.
So I guess we can do away with ->holders.
And once ->holders is gone, I doubt it'll make any sense to retain
the spinlock, as we then just have ->thread to worry about.
And this doesn't need to be spinlock protected, as we take care to
synchronize during startup. And during shutdown we simply don't
care, as pthread_cancel() will return an error if the thread is
already done for.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@xxxxxxx			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel



[Index of Archives]     [DM Crypt]     [Fedora Desktop]     [ATA RAID]     [Fedora Marketing]     [Fedora Packaging]     [Fedora SELinux]     [Yosemite Discussion]     [KDE Users]     [Fedora Docs]

  Powered by Linux