[Patch 4/7] tabled: retry conflicting locks
This problem has been with us for a while, and even with this fix our
start-up is not reliable. But at least we are no longer 100% guaranteed to
hang, as before, when restarting too quickly: on a lock conflict we now
retry the lock after a delay instead of giving up. So although the whole
area needs some serious reworking, this specific case was just too annoying
to leave alone.
Signed-Off-By: Pete Zaitcev <zaitcev@xxxxxxxxxx>
---
server/cldu.c | 38 ++++++++++++++++++++++++++++++++++----
1 file changed, 34 insertions(+), 4 deletions(-)
commit fa910aacff5118664177f988029cc5f8e6ef886d
Author: Master <zaitcev@xxxxxxxxxxxxxxxxxx>
Date: Thu Jan 14 19:56:13 2010 -0700
Retry the lock on conflict.
diff --git a/server/cldu.c b/server/cldu.c
index 273f149..1d61672 100644
--- a/server/cldu.c
+++ b/server/cldu.c
@@ -59,6 +59,7 @@ struct cld_session {
* using sleep(), neither of the timers must ever be active simultane-
* ously with any other. But using one timer structure is too annoying.
*/
+ struct event tm_relock;
struct event tm_retry;
struct event tm_rescan;
struct event tm_reopen;
@@ -85,6 +86,7 @@ static int cldu_set_cldc(struct cld_session *sp, int newactive);
static int cldu_new_sess(struct cldc_call_opts *carg, enum cle_err_codes errc);
static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
+static void try_lock(struct cld_session *sp);
static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
static int cldu_put_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
static int cldu_get_1_cb(struct cldc_call_opts *carg, enum cle_err_codes errc);
@@ -99,6 +101,7 @@ static int cldu_close_y_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
static void add_remote(const char *name);
static void add_chunk_node(struct cld_session *sp, const char *name);
+static struct timeval cldu_relock_delay = { 10, 0 };
static struct timeval cldu_retry_delay = { 5, 0 };
static struct timeval cldu_rescan_delay = { 50, 0 };
static struct timeval cldu_reopen_delay = { 3, 0 };
@@ -168,6 +171,15 @@ err_oom:
return 0;
}
+static void cldu_tm_relock(int fd, short events, void *userdata)
+{
+ struct cld_session *sp = userdata;
+
+ if (debugging)
+ applog(LOG_DEBUG, "Retrying locking of %s", sp->ffname);
+ try_lock(sp);
+}
+
static void cldu_tm_retry(int fd, short events, void *userdata)
{
struct cld_session *sp = userdata;
@@ -454,8 +466,6 @@ static int cldu_open_c_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
{
struct cld_session *sp = carg->private;
- struct cldc_call_opts copts;
- int rc;
if (errc != CLE_OK) {
applog(LOG_ERR, "CLD open(%s) failed: %d", sp->ffname, errc);
@@ -473,6 +483,15 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
if (debugging)
applog(LOG_DEBUG, "CLD file \"%s\" created", sp->ffname);
+ try_lock(sp);
+ return 0;
+}
+
+static void try_lock(struct cld_session *sp)
+{
+ struct cldc_call_opts copts;
+ int rc;
+
/*
* Lock the file, in case two hosts got the same hostname.
*/
@@ -483,8 +502,6 @@ static int cldu_open_f_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
if (rc) {
applog(LOG_ERR, "cldc_lock call error %d", rc);
}
-
- return 0;
}
static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
@@ -497,6 +514,18 @@ static int cldu_lock_cb(struct cldc_call_opts *carg, enum cle_err_codes errc)
if (errc != CLE_OK) {
applog(LOG_ERR, "CLD lock(%s) failed: %d", sp->ffname, errc);
+ if (errc == CLE_LOCK_CONFLICT) {
+ /*
+ * The usual reason why we get a lock conflict is
+ * restarting too quickly and hitting the previous lock
+ * that is going to disappear soon.
+ *
+ * FIXME: However, it may also be that a master
+ * is ok and we should become a slave, e.g. start TDB.
+ * We do not support multi-node, but we should.
+ */
+ evtimer_add(&sp->tm_relock, &cldu_relock_delay);
+ }
return 0;
}
@@ -940,6 +969,7 @@ int cld_begin(const char *thishost, const char *thiscell)
{
static struct cld_session *sp = &ses;
+ evtimer_set(&ses.tm_relock, cldu_tm_relock, &ses);
evtimer_set(&ses.tm_retry, cldu_tm_retry, &ses);
evtimer_set(&ses.tm_rescan, cldu_tm_rescan, &ses);
evtimer_set(&ses.tm_reopen, cldu_tm_reopen, &ses);
--
To unsubscribe from this list: send the line "unsubscribe hail-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
[Linux USB Devel]
[Video for Linux]
[Linux Audio Users]
[Photo]
[Yosemite News]
[Yosemite Photos]
[Free Online Dating]
[Linux Kernel]
[Linux SCSI]
[XFree86]