Skip to content

Commit

Permalink
staging/lustre: fix build error on non-x86 platforms
Browse files Browse the repository at this point in the history
dump_trace() is only available on X86. Without it, Lustre's own
watchdog is broken. We can only dump current task's stack.

The client-side this code is much less likely to hit deadlocks and
it's probably OK to drop this altogether, since we hardly have any
ptlrpc threads on clients, most notable ones are ldlm cb threads
that should not really be blocking on the client anyway.

Remove libcfs watchdog for now, until the upstream kernel watchdog
can detect distributed deadlocks and dump other kernel threads.

Cc: Oleg Drokin <oleg.drokin@intel.com>
Signed-off-by: Peng Tao <tao.peng@emc.com>
  • Loading branch information
bergwolf committed Jul 9, 2013
1 parent fb8d38b commit 30cd59b
Show file tree
Hide file tree
Showing 12 changed files with 22 additions and 594 deletions.
25 changes: 0 additions & 25 deletions drivers/staging/lustre/include/linux/libcfs/libcfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,31 +117,6 @@ int libcfs_sock_write(socket_t *sock, void *buffer, int nob, int timeout);
int libcfs_sock_read(socket_t *sock, void *buffer, int nob, int timeout);
void libcfs_sock_release(socket_t *sock);

/* libcfs watchdogs */
struct lc_watchdog;

/* Add a watchdog which fires after "time" milliseconds of delay. You have to
* touch it once to enable it. */
struct lc_watchdog *lc_watchdog_add(int time,
void (*cb)(pid_t pid, void *),
void *data);

/* Enables a watchdog and resets its timer. */
void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout);
#define CFS_GET_TIMEOUT(svc) (max_t(int, obd_timeout, \
AT_OFF ? 0 : at_get(&svc->srv_at_estimate)) * \
svc->srv_watchdog_factor)

/* Disable a watchdog; touch it to restart it. */
void lc_watchdog_disable(struct lc_watchdog *lcw);

/* Clean up the watchdog */
void lc_watchdog_delete(struct lc_watchdog *lcw);

/* Dump a debug log */
void lc_watchdog_dumplog(pid_t pid, void *data);


/* need both kernel and user-land acceptor */
#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512
#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,6 @@ do { \
#define ntohs(x) ___ntohs(x)
#endif

void libcfs_debug_dumpstack(task_t *tsk);
void libcfs_run_upcall(char **argv);
void libcfs_run_lbug_upcall(struct libcfs_debug_msg_data *);
void libcfs_debug_dumplog(void);
Expand Down
4 changes: 2 additions & 2 deletions drivers/staging/lustre/lustre/include/linux/obd.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ static inline void __client_obd_list_lock(client_obd_lock_t *lock,
(jiffies - lock->time) / HZ);
LCONSOLE_WARN("====== for process holding the "
"lock =====\n");
libcfs_debug_dumpstack(task);
dump_stack();

This comment has been minimized.

Copy link
@adilger

adilger Jul 10, 2013

I'd prefer to delete this line and the above comment, instead of printing some misleading information.

This comment has been minimized.

Copy link
@bergwolf

bergwolf Jul 10, 2013

Author Owner

Will do it before sending patches out. Thanks.

LCONSOLE_WARN("====== for current process =====\n");
libcfs_debug_dumpstack(NULL);
dump_stack();
LCONSOLE_WARN("====== end =======\n");
cfs_pause(1000 * HZ);
}
Expand Down
7 changes: 6 additions & 1 deletion drivers/staging/lustre/lustre/include/lustre_net.h
Original file line number Diff line number Diff line change
Expand Up @@ -2322,8 +2322,13 @@ struct ptlrpc_thread {
pid_t t_pid;
/**
* put watchdog in the structure per thread b=14840
*
* Lustre watchdog is removed for client in the hope
* of a generic watchdog can be merged in kernel.
* When that happens, we should add below back.
*
* struct lc_watchdog *t_watchdog;
*/
struct lc_watchdog *t_watchdog;
/**
* the svc this thread belonged to b=18582
*/
Expand Down
2 changes: 1 addition & 1 deletion drivers/staging/lustre/lustre/ldlm/ldlm_lock.c
Original file line number Diff line number Diff line change
Expand Up @@ -1548,7 +1548,7 @@ int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill,
break;
default:
LDLM_ERROR(lock, "Unknown LVB type: %d\n", lock->l_lvb_type);
libcfs_debug_dumpstack(NULL);
dump_stack();
RETURN(-EINVAL);
}

Expand Down
2 changes: 1 addition & 1 deletion drivers/staging/lustre/lustre/libcfs/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ libcfs-linux-objs += linux-crypto-adler.o
libcfs-linux-objs := $(addprefix linux/,$(libcfs-linux-objs))

libcfs-all-objs := debug.o fail.o nidstrings.o module.o tracefile.o \
watchdog.o libcfs_string.o hash.o kernel_user_comm.o \
libcfs_string.o hash.o kernel_user_comm.o \
prng.o workitem.o upcall_cache.o libcfs_cpu.o \
libcfs_mem.o libcfs_lock.o

Expand Down
45 changes: 1 addition & 44 deletions drivers/staging/lustre/lustre/libcfs/linux/linux-debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
/* not reached */
}

libcfs_debug_dumpstack(NULL);
dump_stack();
if (!libcfs_panic_on_lbug)
libcfs_debug_dumplog();
libcfs_run_lbug_upcall(msgdata);
Expand All @@ -179,48 +179,6 @@ void lbug_with_loc(struct libcfs_debug_msg_data *msgdata)
schedule();
}


#include <linux/nmi.h>
#include <asm/stacktrace.h>


static int print_trace_stack(void *data, char *name)
{
printk(" <%s> ", name);
return 0;
}

# define RELIABLE reliable
# define DUMP_TRACE_CONST const
static void print_trace_address(void *data, unsigned long addr, int reliable)
{
char fmt[32];
touch_nmi_watchdog();
sprintf(fmt, " [<%016lx>] %s%%s\n", addr, RELIABLE ? "": "? ");
__print_symbol(fmt, addr);
}

static DUMP_TRACE_CONST struct stacktrace_ops print_trace_ops = {
.stack = print_trace_stack,
.address = print_trace_address,
.walk_stack = print_context_stack,
};

void libcfs_debug_dumpstack(struct task_struct *tsk)
{
/* dump_stack() */
/* show_trace() */
if (tsk == NULL)
tsk = current;
printk("Pid: %d, comm: %.20s\n", tsk->pid, tsk->comm);
/* show_trace_log_lvl() */
printk("\nCall Trace:\n");
dump_trace(tsk, NULL, NULL,
0,
&print_trace_ops, NULL);
printk("\n");
}

task_t *libcfs_current(void)
{
CWARN("current task struct is %p\n", current);
Expand Down Expand Up @@ -255,7 +213,6 @@ void libcfs_unregister_panic_notifier(void)
atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
}

EXPORT_SYMBOL(libcfs_debug_dumpstack);
EXPORT_SYMBOL(libcfs_current);


Expand Down
Loading

2 comments on commit 30cd59b

@adilger
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of commenting out this code, it could be #ifdef CONFIG_TIMER_BASED_LOCKUP_DETECTOR or similar. That would make it more obvious that something needs to be done when your other patch lands, but that may also introduce dependencies on the code we don't want. I'm OK with it landing like this, so long as we don't forget about it.

@bergwolf
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the timer based lockup detector, all APIs have proper empty inline implementations so that we don't actually need to depend on CONFIG_TIMER_BASED_LOCKUP_DETECTOR.

Please sign in to comment.