diff --git a/config/kernel-declare-event-class.m4 b/config/kernel-declare-event-class.m4 new file mode 100644 index 000000000000..7867d751749d --- /dev/null +++ b/config/kernel-declare-event-class.m4 @@ -0,0 +1,59 @@ +dnl # +dnl # Ensure the DECLARE_EVENT_CLASS macro is available to non-GPL modules. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_DECLARE_EVENT_CLASS], [ + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-I\$(src)" + + AC_MSG_CHECKING([whether DECLARE_EVENT_CLASS() is available]) + ZFS_LINUX_TRY_COMPILE_HEADER([ + #include + MODULE_LICENSE(ZFS_META_LICENSE); + + #define CREATE_TRACE_POINTS + #include "conftest.h" + ],[ + trace_zfs_autoconf_event_one(1UL); + trace_zfs_autoconf_event_two(2UL); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DECLARE_EVENT_CLASS, 1, + [DECLARE_EVENT_CLASS() is available]) + ],[ + AC_MSG_RESULT(no) + ],[ + #if !defined(_CONFTEST_H) || defined(TRACE_HEADER_MULTI_READ) + #define _CONFTEST_H + + #undef TRACE_SYSTEM + #define TRACE_SYSTEM zfs + #include + + DECLARE_EVENT_CLASS(zfs_autoconf_event_class, + TP_PROTO(unsigned long i), + TP_ARGS(i), + TP_STRUCT__entry( + __field(unsigned long, i) + ), + TP_fast_assign( + __entry->i = i; + ), + TP_printk("i = %lu", __entry->i) + ); + + #define DEFINE_AUTOCONF_EVENT(name) \ + DEFINE_EVENT(zfs_autoconf_event_class, name, \ + TP_PROTO(unsigned long i), \ + TP_ARGS(i)) + DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_one); + DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_two); + + #endif /* _CONFTEST_H */ + + #undef TRACE_INCLUDE_PATH + #define TRACE_INCLUDE_PATH . 
+ #define TRACE_INCLUDE_FILE conftest + #include + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 9145dbd89bf4..d8784c9db9de 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -6,6 +6,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_SPL ZFS_AC_TEST_MODULE ZFS_AC_KERNEL_CONFIG + ZFS_AC_KERNEL_DECLARE_EVENT_CLASS ZFS_AC_KERNEL_BDEV_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID ZFS_AC_KERNEL_TYPE_FMODE_T @@ -506,9 +507,18 @@ AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC], [ ]) dnl # -dnl # ZFS_LINUX_CONFTEST +dnl # ZFS_LINUX_CONFTEST_H / write the argument to conftest.h dnl # -AC_DEFUN([ZFS_LINUX_CONFTEST], [ +AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ +cat - <<_ACEOF >conftest.h +$1 +_ACEOF +]) + +dnl # +dnl # ZFS_LINUX_CONFTEST_C / write confdefs.h plus the argument to conftest.c +dnl # +AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ cat confdefs.h - <<_ACEOF >conftest.c $1 _ACEOF @@ -534,13 +544,14 @@ dnl # dnl # ZFS_LINUX_COMPILE_IFELSE / like AC_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ - m4_ifvaln([$1], [ZFS_LINUX_CONFTEST([$1])]) + m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1])]) + m4_ifvaln([$6], [ZFS_LINUX_CONFTEST_H([$6])], [ZFS_LINUX_CONFTEST_H([])]) rm -Rf build && mkdir -p build && touch build/conftest.mod.c echo "obj-m := conftest.o" >build/Makefile modpost_flag='' test "x$enable_linux_builtin" = xyes && modpost_flag='modpost=true' # fake modpost stage AS_IF( - [AC_TRY_COMMAND(cp conftest.c build && make [$2] -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag) >/dev/null && AC_TRY_COMMAND([$3])], + [AC_TRY_COMMAND(cp conftest.c conftest.h build && make [$2] -C $LINUX_OBJ EXTRA_CFLAGS="-Werror $EXTRA_KCFLAGS" $ARCH_UM M=$PWD/build $modpost_flag) >/dev/null && AC_TRY_COMMAND([$3])], [$4], [_AC_MSG_LOG_CONFTEST m4_ifvaln([$5],[$5])] ) @@ -627,3 +638,16 @@ AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ fi fi ]) + +dnl # +dnl # ZFS_LINUX_TRY_COMPILE_HEADER +dnl # like ZFS_LINUX_TRY_COMPILE, except the contents of conftest.h are +dnl # 
provided via the fifth parameter +dnl # +AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], + [ZFS_LINUX_COMPILE_IFELSE( + [AC_LANG_SOURCE([ZFS_LANG_PROGRAM([[$1]], [[$2]])])], + [modules], + [test -s build/conftest.o], + [$3], [$4], [AC_LANG_SOURCE([$5])]) +]) diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index a755b394da9d..7ddace00d8cf 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -2,6 +2,7 @@ SUBDIRS = fm fs COMMON_H = \ $(top_srcdir)/include/sys/arc.h \ + $(top_srcdir)/include/sys/arc_impl.h \ $(top_srcdir)/include/sys/avl.h \ $(top_srcdir)/include/sys/avl_impl.h \ $(top_srcdir)/include/sys/blkptr.h \ @@ -39,11 +40,13 @@ COMMON_H = \ $(top_srcdir)/include/sys/rrwlock.h \ $(top_srcdir)/include/sys/sa.h \ $(top_srcdir)/include/sys/sa_impl.h \ + $(top_srcdir)/include/sys/sdt.h \ $(top_srcdir)/include/sys/spa_boot.h \ $(top_srcdir)/include/sys/space_map.h \ $(top_srcdir)/include/sys/space_reftree.h \ $(top_srcdir)/include/sys/spa.h \ $(top_srcdir)/include/sys/spa_impl.h \ + $(top_srcdir)/include/sys/trace.h \ $(top_srcdir)/include/sys/txg.h \ $(top_srcdir)/include/sys/txg_impl.h \ $(top_srcdir)/include/sys/u8_textprep_data.h \ diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h new file mode 100644 index 000000000000..f7e1295a5c2a --- /dev/null +++ b/include/sys/arc_impl.h @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SYS_ARC_IMPL_H +#define _SYS_ARC_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recently used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they + * are linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.) is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will acquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. 
The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ + uint64_t arcs_size; /* total amount of data in this state */ + kmutex_t arcs_mtx; + arc_state_type_t arcs_state; +} arc_state_t; + +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_done_func_t *acb_done; + arc_buf_t *acb_buf; + zio_t *acb_zio_dummy; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_done_func_t *awcb_ready; + arc_done_func_t *awcb_physdone; + arc_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + uint64_t b_cksum0; + + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_buf_t *b_buf; + uint32_t b_flags; + uint32_t b_datacnt; + + arc_callback_t *b_acb; + kcondvar_t b_cv; + + /* immutable */ + arc_buf_contents_t b_type; + uint64_t b_size; + uint64_t b_spa; + + /* protected by arc state mutex */ + arc_state_t *b_state; + list_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; + + /* self protecting */ + refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; +}; + +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; 
/* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_IMPL_H */ diff --git a/include/sys/sdt.h b/include/sys/sdt.h new file mode 100644 index 000000000000..56efa1b3998b --- /dev/null +++ b/include/sys/sdt.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_SDT_H +#define _SYS_SDT_H + +#ifndef _KERNEL + +#define ZFS_PROBE(a) ((void) 0) +#define ZFS_PROBE1(a, c) ((void) 0) +#define ZFS_PROBE2(a, c, e) ((void) 0) +#define ZFS_PROBE3(a, c, e, g) ((void) 0) +#define ZFS_PROBE4(a, c, e, g, i) ((void) 0) +#define ZFS_SET_ERROR(err) ((void) 0) + +#else + +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#include + +/* + * The set-error SDT probe is extra static, in that we declare its fake + * function literally, rather than with the DTRACE_PROBE1() macro. This is + * necessary so that SET_ERROR() can evaluate to a value, which wouldn't + * be possible if it required multiple statements (to declare the function + * and then call it). + * + * SET_ERROR() uses the comma operator so that it can be used without much + * additional code. For example, "return (EINVAL);" becomes + * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated + * twice, so it should not have side effects (e.g. something like: + * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). + */ +#define SET_ERROR(err) \ + (trace_zfs_set__error(__FILE__, __func__, __LINE__, err), err) + +#else + +#undef SET_ERROR +#define SET_ERROR(err) (err) + +#endif /* HAVE_DECLARE_EVENT_CLASS */ + +#endif /* _KERNEL */ + +#endif /* _SYS_SDT_H */ diff --git a/include/sys/trace.h b/include/sys/trace.h new file mode 100644 index 000000000000..c73b15ac3ddb --- /dev/null +++ b/include/sys/trace.h @@ -0,0 +1,1038 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Prakash Surya. All rights reserved. + */ + +#if defined(_KERNEL) && defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#if !defined(_TRACE_ZFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZFS_H + +#include +#include +#include + +/* + * Redefine the DTRACE_PROBE* macros to call trace_zfs_<name>(); types unused + */ +#undef DTRACE_PROBE1 +#define DTRACE_PROBE1(name, t1, arg1) \ + trace_zfs_##name((arg1)) + +#undef DTRACE_PROBE2 +#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \ + trace_zfs_##name((arg1), (arg2)) + +#undef DTRACE_PROBE3 +#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \ + trace_zfs_##name((arg1), (arg2), (arg3)) + +#undef DTRACE_PROBE4 +#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \ + trace_zfs_##name((arg1), (arg2), (arg3), (arg4)) + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct zio zio_t; +typedef struct vdev vdev_t; +typedef struct l2arc_write_callback l2arc_write_callback_t; +typedef struct blkptr blkptr_t; +typedef struct zbookmark_phys zbookmark_phys_t; +typedef struct l2arc_dev l2arc_dev_t; +typedef struct dmu_buf_impl dmu_buf_impl_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dnode dnode_t; +typedef struct dsl_pool dsl_pool_t; +typedef struct znode znode_t; +typedef struct zfs_ace_hdr zfs_ace_hdr_t; +typedef struct zilog zilog_t; +typedef struct zrlock zrlock_t; + +/* + * Generic support for one argument tracepoints of the form: + * + * DTRACE_PROBE1(..., + * arc_buf_hdr_t *, ...); + */ +DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, + TP_PROTO(arc_buf_hdr_t 
*ab), + TP_ARGS(ab), + TP_STRUCT__entry( + __array(uint64_t, hdr_dva_word, 2) + __field(uint64_t, hdr_birth) + __field(uint64_t, hdr_cksum0) + __field(uint32_t, hdr_flags) + __field(uint32_t, hdr_datacnt) + __field(arc_buf_contents_t, hdr_type) + __field(uint64_t, hdr_size) + __field(uint64_t, hdr_spa) + __field(arc_state_type_t, hdr_state_type) + __field(clock_t, hdr_access) + __field(uint32_t, hdr_mru_hits) + __field(uint32_t, hdr_mru_ghost_hits) + __field(uint32_t, hdr_mfu_hits) + __field(uint32_t, hdr_mfu_ghost_hits) + __field(uint32_t, hdr_l2_hits) + __field(int64_t, hdr_refcount) + ), + TP_fast_assign( + __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; + __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; + __entry->hdr_birth = ab->b_birth; + __entry->hdr_cksum0 = ab->b_cksum0; + __entry->hdr_flags = ab->b_flags; + __entry->hdr_datacnt = ab->b_datacnt; + __entry->hdr_type = ab->b_type; + __entry->hdr_size = ab->b_size; + __entry->hdr_spa = ab->b_spa; + __entry->hdr_state_type = ab->b_state->arcs_state; + __entry->hdr_access = ab->b_arc_access; + __entry->hdr_mru_hits = ab->b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits; + __entry->hdr_l2_hits = ab->b_l2_hits; + __entry->hdr_refcount = ab->b_refcnt.rc_count; + ), + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + "flags 0x%x datacnt %u type %u size %llu spa %llu " + "state_type %u access %lu mru_hits %u mru_ghost_hits %u " + "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", + __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], + __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_spa, __entry->hdr_state_type, + __entry->hdr_access, __entry->hdr_mru_hits, + __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, + __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, + __entry->hdr_refcount) +); + 
+#define DEFINE_ARC_BUF_HDR_EVENT(name) \ +DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ + TP_PROTO(arc_buf_hdr_t *ab), \ + TP_ARGS(ab)) +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); +DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); +DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); +DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * vdev_t *, ..., + * zio_t *, ...); + */ +#define ZIO_TP_STRUCT_ENTRY \ + __field(zio_type_t, zio_type) \ + __field(int, zio_cmd) \ + __field(zio_priority_t, zio_priority) \ + __field(uint64_t, zio_size) \ + __field(uint64_t, zio_orig_size) \ + __field(uint64_t, zio_offset) \ + __field(hrtime_t, zio_timestamp) \ + __field(hrtime_t, zio_delta) \ + __field(uint64_t, zio_delay) \ + __field(enum zio_flag, zio_flags) \ + __field(enum zio_stage, zio_stage) \ + __field(enum zio_stage, zio_pipeline) \ + __field(enum zio_flag, zio_orig_flags) \ + __field(enum zio_stage, zio_orig_stage) \ + __field(enum zio_stage, zio_orig_pipeline) \ + __field(uint8_t, zio_reexecute) \ + __field(uint64_t, zio_txg) \ + __field(int, zio_error) \ + __field(uint64_t, zio_ena) \ + \ + __field(enum zio_checksum, zp_checksum) \ + __field(enum zio_compress, zp_compress) \ + __field(dmu_object_type_t, zp_type) \ + __field(uint8_t, zp_level) \ + __field(uint8_t, zp_copies) \ + __field(boolean_t, zp_dedup) \ + __field(boolean_t, zp_dedup_verify) \ + __field(boolean_t, zp_nopwrite) + +#define ZIO_TP_FAST_ASSIGN \ + __entry->zio_type = zio->io_type; \ + __entry->zio_cmd = zio->io_cmd; \ + __entry->zio_priority = zio->io_priority; \ + __entry->zio_size = zio->io_size; \ + __entry->zio_orig_size = zio->io_orig_size; \ + __entry->zio_offset = zio->io_offset; \ + __entry->zio_timestamp = zio->io_timestamp; \ + __entry->zio_delta = zio->io_delta; \ + __entry->zio_delay = 
zio->io_delay; \ + __entry->zio_flags = zio->io_flags; \ + __entry->zio_stage = zio->io_stage; \ + __entry->zio_pipeline = zio->io_pipeline; \ + __entry->zio_orig_flags = zio->io_orig_flags; \ + __entry->zio_orig_stage = zio->io_orig_stage; \ + __entry->zio_orig_pipeline = zio->io_orig_pipeline; \ + __entry->zio_reexecute = zio->io_reexecute; \ + __entry->zio_txg = zio->io_txg; \ + __entry->zio_error = zio->io_error; \ + __entry->zio_ena = zio->io_ena; \ + \ + __entry->zp_checksum = zio->io_prop.zp_checksum; \ + __entry->zp_compress = zio->io_prop.zp_compress; \ + __entry->zp_type = zio->io_prop.zp_type; \ + __entry->zp_level = zio->io_prop.zp_level; \ + __entry->zp_copies = zio->io_prop.zp_copies; \ + __entry->zp_dedup = zio->io_prop.zp_dedup; \ + __entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \ + __entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify; + +#define ZIO_TP_PRINTK_FMT \ + "zio { type %u cmd %i prio %u size %llu orig_size %llu " \ + "offset %llu timestamp %llu delta %llu delay %llu " \ + "flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \ + "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ + "txg %llu error %d ena %llu prop { checksum %u compress %u " \ + "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" + +#define ZIO_TP_PRINTK_ARGS \ + __entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \ + __entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \ + __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ + __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ + __entry->zio_orig_flags, __entry->zio_orig_stage, \ + __entry->zio_orig_pipeline, __entry->zio_reexecute, \ + __entry->zio_txg, __entry->zio_error, __entry->zio_ena, \ + __entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \ + __entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \ + __entry->zp_dedup_verify, __entry->zp_nopwrite + + +DECLARE_EVENT_CLASS(zfs_l2arc_rw_class, + TP_PROTO(vdev_t *vd, zio_t *zio), + 
TP_ARGS(vd, zio), + TP_STRUCT__entry( + __field(uint64_t, vdev_id) + __field(uint64_t, vdev_guid) + __field(uint64_t, vdev_state) + ZIO_TP_STRUCT_ENTRY + ), + TP_fast_assign( + __entry->vdev_id = vd->vdev_id; + __entry->vdev_guid = vd->vdev_guid; + __entry->vdev_state = vd->vdev_state; + ZIO_TP_FAST_ASSIGN + ), + TP_printk("vdev { id %llu guid %llu state %llu } " + ZIO_TP_PRINTK_FMT, __entry->vdev_id, __entry->vdev_guid, + __entry->vdev_state, ZIO_TP_PRINTK_ARGS) +); + +#define DEFINE_L2ARC_RW_EVENT(name) \ +DEFINE_EVENT(zfs_l2arc_rw_class, name, \ + TP_PROTO(vdev_t *vd, zio_t *zio), \ + TP_ARGS(vd, zio)) +DEFINE_L2ARC_RW_EVENT(zfs_l2arc__read); +DEFINE_L2ARC_RW_EVENT(zfs_l2arc__write); + +/* + * Generic support for two argument tracepoints of the form + * (the l2arc_write_callback_t argument is not recorded): + * DTRACE_PROBE2(..., + * zio_t *, ..., + * l2arc_write_callback_t *, ...); + */ +DECLARE_EVENT_CLASS(zfs_l2arc_iodone_class, + TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), + TP_ARGS(zio, cb), + TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY), + TP_fast_assign(ZIO_TP_FAST_ASSIGN), + TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) +); + +#define DEFINE_L2ARC_IODONE_EVENT(name) \ +DEFINE_EVENT(zfs_l2arc_iodone_class, name, \ + TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), \ + TP_ARGS(zio, cb)) +DEFINE_L2ARC_IODONE_EVENT(zfs_l2arc__iodone); + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * arc_buf_hdr_t *, ..., + * const blkptr_t *, ..., + * uint64_t, ..., + * const zbookmark_phys_t *, ...); + */ +DECLARE_EVENT_CLASS(zfs_arc_miss_class, + TP_PROTO(arc_buf_hdr_t *hdr, + const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), + TP_ARGS(hdr, bp, size, zb), + TP_STRUCT__entry( + __array(uint64_t, hdr_dva_word, 2) + __field(uint64_t, hdr_birth) + __field(uint64_t, hdr_cksum0) + __field(uint32_t, hdr_flags) + __field(uint32_t, hdr_datacnt) + __field(arc_buf_contents_t, hdr_type) + __field(uint64_t, hdr_size) + __field(uint64_t, hdr_spa) + __field(arc_state_type_t, 
hdr_state_type) + __field(clock_t, hdr_access) + __field(uint32_t, hdr_mru_hits) + __field(uint32_t, hdr_mru_ghost_hits) + __field(uint32_t, hdr_mfu_hits) + __field(uint32_t, hdr_mfu_ghost_hits) + __field(uint32_t, hdr_l2_hits) + __field(int64_t, hdr_refcount) + + __array(uint64_t, bp_dva0, 2) + __array(uint64_t, bp_dva1, 2) + __array(uint64_t, bp_dva2, 2) + __array(uint64_t, bp_cksum, 4) + + __field(uint64_t, bp_lsize) + + __field(uint64_t, zb_objset) + __field(uint64_t, zb_object) + __field(int64_t, zb_level) + __field(uint64_t, zb_blkid) + ), + TP_fast_assign( + __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; + __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; + __entry->hdr_birth = hdr->b_birth; + __entry->hdr_cksum0 = hdr->b_cksum0; + __entry->hdr_flags = hdr->b_flags; + __entry->hdr_datacnt = hdr->b_datacnt; + __entry->hdr_type = hdr->b_type; + __entry->hdr_size = hdr->b_size; + __entry->hdr_spa = hdr->b_spa; + __entry->hdr_state_type = hdr->b_state->arcs_state; + __entry->hdr_access = hdr->b_arc_access; + __entry->hdr_mru_hits = hdr->b_mru_hits; + __entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits; + __entry->hdr_l2_hits = hdr->b_l2_hits; + __entry->hdr_refcount = hdr->b_refcnt.rc_count; + + __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; + __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; + __entry->bp_dva1[0] = bp->blk_dva[1].dva_word[0]; + __entry->bp_dva1[1] = bp->blk_dva[1].dva_word[1]; + __entry->bp_dva2[0] = bp->blk_dva[2].dva_word[0]; + __entry->bp_dva2[1] = bp->blk_dva[2].dva_word[1]; + __entry->bp_cksum[0] = bp->blk_cksum.zc_word[0]; + __entry->bp_cksum[1] = bp->blk_cksum.zc_word[1]; + __entry->bp_cksum[2] = bp->blk_cksum.zc_word[2]; + __entry->bp_cksum[3] = bp->blk_cksum.zc_word[3]; + + __entry->bp_lsize = size; + + __entry->zb_objset = zb->zb_objset; + __entry->zb_object = zb->zb_object; + __entry->zb_level = zb->zb_level; + __entry->zb_blkid 
= zb->zb_blkid; + ), + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx " + "flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u " + "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " + "mfu_ghost_hits %u l2_hits %u refcount %lli } " + "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " + "0x%llx:0x%llx cksum 0x%llx:0x%llx:0x%llx:0x%llx " + "lsize %llu } zb { objset %llu object %llu level %lli " + "blkid %llu }", + __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], + __entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags, + __entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size, + __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, + __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, + __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, + __entry->hdr_l2_hits, __entry->hdr_refcount, + __entry->bp_dva0[0], __entry->bp_dva0[1], + __entry->bp_dva1[0], __entry->bp_dva1[1], + __entry->bp_dva2[0], __entry->bp_dva2[1], + __entry->bp_cksum[0], __entry->bp_cksum[1], + __entry->bp_cksum[2], __entry->bp_cksum[3], + __entry->bp_lsize, __entry->zb_objset, __entry->zb_object, + __entry->zb_level, __entry->zb_blkid) +); + +#define DEFINE_ARC_MISS_EVENT(name) \ +DEFINE_EVENT(zfs_arc_miss_class, name, \ + TP_PROTO(arc_buf_hdr_t *hdr, \ + const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), \ + TP_ARGS(hdr, bp, size, zb)) +DEFINE_ARC_MISS_EVENT(zfs_arc__miss); + +/* + * Generic support for four argument tracepoints of the form + * (the list_t argument is not recorded): + * DTRACE_PROBE4(..., + * l2arc_dev_t *, ..., + * list_t *, ..., + * uint64_t, ..., + * boolean_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_l2arc_evict_class, + TP_PROTO(l2arc_dev_t *dev, + list_t *buflist, uint64_t taddr, boolean_t all), + TP_ARGS(dev, buflist, taddr, all), + TP_STRUCT__entry( + __field(uint64_t, vdev_id) + __field(uint64_t, vdev_guid) + __field(uint64_t, vdev_state) + + __field(uint64_t, l2ad_hand) + __field(uint64_t, l2ad_start) + __field(uint64_t, l2ad_end) + __field(uint64_t, 
l2ad_evict) + __field(boolean_t, l2ad_first) + __field(boolean_t, l2ad_writing) + + __field(uint64_t, taddr) + __field(boolean_t, all) + ), + TP_fast_assign( + __entry->vdev_id = dev->l2ad_vdev->vdev_id; + __entry->vdev_guid = dev->l2ad_vdev->vdev_guid; + __entry->vdev_state = dev->l2ad_vdev->vdev_state; + + __entry->l2ad_hand = dev->l2ad_hand; + __entry->l2ad_start = dev->l2ad_start; + __entry->l2ad_end = dev->l2ad_end; + __entry->l2ad_evict = dev->l2ad_evict; + __entry->l2ad_first = dev->l2ad_first; + __entry->l2ad_writing = dev->l2ad_writing; + + __entry->taddr = taddr; + __entry->all = all; + ), + TP_printk("l2ad { vdev { id %llu guid %llu state %llu } " + "hand %llu start %llu end %llu evict %llu " + "first %d writing %d } taddr %llu all %d", + __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, + __entry->l2ad_hand, __entry->l2ad_start, + __entry->l2ad_end, __entry->l2ad_evict, + __entry->l2ad_first, __entry->l2ad_writing, + __entry->taddr, __entry->all) +); + +#define DEFINE_L2ARC_EVICT_EVENT(name) \ +DEFINE_EVENT(zfs_l2arc_evict_class, name, \ + TP_PROTO(l2arc_dev_t *dev, \ + list_t *buflist, uint64_t taddr, boolean_t all), \ + TP_ARGS(dev, buflist, taddr, all)) +DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * dmu_tx_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_delay_mintime_class, + TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), + TP_ARGS(tx, dirty, min_tx_time), + TP_STRUCT__entry( + __field(uint64_t, tx_txg) + __field(uint64_t, tx_lastsnap_txg) + __field(uint64_t, tx_lasttried_txg) + __field(boolean_t, tx_anyobj) + __field(boolean_t, tx_waited) + __field(hrtime_t, tx_start) + __field(boolean_t, tx_wait_dirty) + __field(int, tx_err) +#ifdef DEBUG_DMU_TX + __field(uint64_t, tx_space_towrite) + __field(uint64_t, tx_space_tofree) + __field(uint64_t, tx_space_tooverwrite) + __field(uint64_t, tx_space_tounref) 
+ __field(int64_t, tx_space_written) + __field(int64_t, tx_space_freed) +#endif + __field(uint64_t, min_tx_time) + __field(uint64_t, dirty) + ), + TP_fast_assign( + __entry->tx_txg = tx->tx_txg; + __entry->tx_lastsnap_txg = tx->tx_lastsnap_txg; + __entry->tx_lasttried_txg = tx->tx_lasttried_txg; + __entry->tx_anyobj = tx->tx_anyobj; + __entry->tx_waited = tx->tx_waited; + __entry->tx_start = tx->tx_start; + __entry->tx_wait_dirty = tx->tx_wait_dirty; + __entry->tx_err = tx->tx_err; +#ifdef DEBUG_DMU_TX + __entry->tx_space_towrite = tx->tx_space_towrite; + __entry->tx_space_tofree = tx->tx_space_tofree; + __entry->tx_space_tooverwrite = tx->tx_space_tooverwrite; + __entry->tx_space_tounref = tx->tx_space_tounref; + __entry->tx_space_written = tx->tx_space_written.rc_count; + __entry->tx_space_freed = tx->tx_space_freed.rc_count; +#endif + __entry->dirty = dirty; + __entry->min_tx_time = min_tx_time; + ), + TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu " + "anyobj %d waited %d start %llu wait_dirty %d err %i " +#ifdef DEBUG_DMU_TX + "space_towrite %llu space_tofree %llu space_tooverwrite %llu " + "space_tounref %llu space_written %lli space_freed %lli " +#endif + "} dirty %llu min_tx_time %llu", + __entry->tx_txg, __entry->tx_lastsnap_txg, + __entry->tx_lasttried_txg, __entry->tx_anyobj, __entry->tx_waited, + __entry->tx_start, __entry->tx_wait_dirty, __entry->tx_err, +#ifdef DEBUG_DMU_TX + __entry->tx_space_towrite, __entry->tx_space_tofree, + __entry->tx_space_tooverwrite, __entry->tx_space_tounref, + __entry->tx_space_written, __entry->tx_space_freed, +#endif + __entry->dirty, __entry->min_tx_time) +); + +#define DEFINE_DELAY_MINTIME_EVENT(name) \ +DEFINE_EVENT(zfs_delay_mintime_class, name, \ + TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), \ + TP_ARGS(tx, dirty, min_tx_time)) +DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime); + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * 
dnode_t *, ..., + * int64_t, ..., + * uint32_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_dnode_move_class, + TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), + TP_ARGS(dn, refcount, dbufs), + TP_STRUCT__entry( + __field(uint64_t, dn_object) + __field(dmu_object_type_t, dn_type) + __field(uint16_t, dn_bonuslen) + __field(uint8_t, dn_bonustype) + __field(uint8_t, dn_nblkptr) + __field(uint8_t, dn_checksum) + __field(uint8_t, dn_compress) + __field(uint8_t, dn_nlevels) + __field(uint8_t, dn_indblkshift) + __field(uint8_t, dn_datablkshift) + __field(uint8_t, dn_moved) + __field(uint16_t, dn_datablkszsec) + __field(uint32_t, dn_datablksz) + __field(uint64_t, dn_maxblkid) + __field(int64_t, dn_tx_holds) + __field(int64_t, dn_holds) + __field(boolean_t, dn_have_spill) + + __field(int64_t, refcount) + __field(uint32_t, dbufs) + ), + TP_fast_assign( + __entry->dn_object = dn->dn_object; + __entry->dn_type = dn->dn_type; + __entry->dn_bonuslen = dn->dn_bonuslen; + __entry->dn_bonustype = dn->dn_bonustype; + __entry->dn_nblkptr = dn->dn_nblkptr; + __entry->dn_checksum = dn->dn_checksum; + __entry->dn_compress = dn->dn_compress; + __entry->dn_nlevels = dn->dn_nlevels; + __entry->dn_indblkshift = dn->dn_indblkshift; + __entry->dn_datablkshift = dn->dn_datablkshift; + __entry->dn_moved = dn->dn_moved; + __entry->dn_datablkszsec = dn->dn_datablkszsec; + __entry->dn_datablksz = dn->dn_datablksz; + __entry->dn_maxblkid = dn->dn_maxblkid; + __entry->dn_tx_holds = dn->dn_tx_holds.rc_count; + __entry->dn_holds = dn->dn_holds.rc_count; + __entry->dn_have_spill = dn->dn_have_spill; + + __entry->refcount = refcount; + __entry->dbufs = dbufs; + ), + TP_printk("dn { object %llu type %d bonuslen %u bonustype %u " + "nblkptr %u checksum %u compress %u nlevels %u indblkshift %u " + "datablkshift %u moved %u datablkszsec %u datablksz %u " + "maxblkid %llu tx_holds %lli holds %lli have_spill %d } " + "refcount %lli dbufs %u", + __entry->dn_object, __entry->dn_type, __entry->dn_bonuslen, + 
__entry->dn_bonustype, __entry->dn_nblkptr, __entry->dn_checksum, + __entry->dn_compress, __entry->dn_nlevels, __entry->dn_indblkshift, + __entry->dn_datablkshift, __entry->dn_moved, + __entry->dn_datablkszsec, __entry->dn_datablksz, + __entry->dn_maxblkid, __entry->dn_tx_holds, __entry->dn_holds, + __entry->dn_have_spill, __entry->refcount, __entry->dbufs) +); + +#define DEFINE_DNODE_MOVE_EVENT(name) \ +DEFINE_EVENT(zfs_dnode_move_class, name, \ + TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), \ + TP_ARGS(dn, refcount, dbufs)) +DEFINE_DNODE_MOVE_EVENT(zfs_dnode__move); + +/* + * Generic support for two argument tracepoints of the form + * (the dsl_pool_t argument is not recorded): + * DTRACE_PROBE2(..., + * dsl_pool_t *, ..., + * uint64_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_txg_class, + TP_PROTO(dsl_pool_t *dp, uint64_t txg), + TP_ARGS(dp, txg), + TP_STRUCT__entry( + __field(uint64_t, txg) + ), + TP_fast_assign( + __entry->txg = txg; + ), + TP_printk("txg %llu", __entry->txg) +); + +#define DEFINE_TXG_EVENT(name) \ +DEFINE_EVENT(zfs_txg_class, name, \ + TP_PROTO(dsl_pool_t *dp, uint64_t txg), \ + TP_ARGS(dp, txg)) +DEFINE_TXG_EVENT(zfs_dsl_pool_sync__done); +DEFINE_TXG_EVENT(zfs_txg__quiescing); +DEFINE_TXG_EVENT(zfs_txg__opened); +DEFINE_TXG_EVENT(zfs_txg__syncing); +DEFINE_TXG_EVENT(zfs_txg__synced); +DEFINE_TXG_EVENT(zfs_txg__quiesced); + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * znode_t *, ..., + * zfs_ace_hdr_t *, ..., + * uint32_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_ace_class, + TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), + TP_ARGS(zn, ace, mask_matched), + TP_STRUCT__entry( + __field(uint64_t, z_id) + __field(uint8_t, z_unlinked) + __field(uint8_t, z_atime_dirty) + __field(uint8_t, z_zn_prefetch) + __field(uint8_t, z_moved) + __field(uint_t, z_blksz) + __field(uint_t, z_seq) + __field(uint64_t, z_mapcnt) + __field(uint64_t, z_gen) + __field(uint64_t, z_size) + __array(uint64_t, z_atime, 2) + __field(uint64_t, 
z_links) + __field(uint64_t, z_pflags) + __field(uint64_t, z_uid) + __field(uint64_t, z_gid) + __field(uint32_t, z_sync_cnt) + __field(mode_t, z_mode) + __field(boolean_t, z_is_sa) + __field(boolean_t, z_is_zvol) + __field(boolean_t, z_is_mapped) + __field(boolean_t, z_is_ctldir) + __field(boolean_t, z_is_stale) + + __field(unsigned long, i_ino) + __field(unsigned int, i_nlink) + __field(u64, i_version) + __field(loff_t, i_size) + __field(unsigned int, i_blkbits) + __field(unsigned short, i_bytes) + __field(umode_t, i_mode) + __field(__u32, i_generation) + + __field(uint16_t, z_type) + __field(uint16_t, z_flags) + __field(uint32_t, z_access_mask) + + __field(uint32_t, mask_matched) + ), + TP_fast_assign( + __entry->z_id = zn->z_id; + __entry->z_unlinked = zn->z_unlinked; + __entry->z_atime_dirty = zn->z_atime_dirty; + __entry->z_zn_prefetch = zn->z_zn_prefetch; + __entry->z_moved = zn->z_moved; + __entry->z_blksz = zn->z_blksz; + __entry->z_seq = zn->z_seq; + __entry->z_mapcnt = zn->z_mapcnt; + __entry->z_gen = zn->z_gen; + __entry->z_size = zn->z_size; + __entry->z_atime[0] = zn->z_atime[0]; + __entry->z_atime[1] = zn->z_atime[1]; + __entry->z_links = zn->z_links; + __entry->z_pflags = zn->z_pflags; + __entry->z_uid = zn->z_uid; + __entry->z_gid = zn->z_gid; + __entry->z_sync_cnt = zn->z_sync_cnt; + __entry->z_mode = zn->z_mode; + __entry->z_is_sa = zn->z_is_sa; + __entry->z_is_zvol = zn->z_is_zvol; + __entry->z_is_mapped = zn->z_is_mapped; + __entry->z_is_ctldir = zn->z_is_ctldir; + __entry->z_is_stale = zn->z_is_stale; + + __entry->i_ino = zn->z_inode.i_ino; + __entry->i_nlink = zn->z_inode.i_nlink; + __entry->i_version = zn->z_inode.i_version; + __entry->i_size = zn->z_inode.i_size; + __entry->i_blkbits = zn->z_inode.i_blkbits; + __entry->i_bytes = zn->z_inode.i_bytes; + __entry->i_mode = zn->z_inode.i_mode; + __entry->i_generation = zn->z_inode.i_generation; + + __entry->z_type = ace->z_type; + __entry->z_flags = ace->z_flags; + __entry->z_access_mask = 
ace->z_access_mask; + + __entry->mask_matched = mask_matched; + ), + TP_printk("zn { id %llu unlinked %u atime_dirty %u " + "zn_prefetch %u moved %u blksz %u seq %u " + "mapcnt %llu gen %llu size %llu atime 0x%llx:0x%llx " + "links %llu pflags %llu uid %llu gid %llu " + "sync_cnt %u mode 0x%x is_sa %d is_zvol %d " + "is_mapped %d is_ctldir %d is_stale %d inode { " + "ino %lu nlink %u version %llu size %lli blkbits %u " + "bytes %u mode 0x%x generation %x } } ace { type %u " + "flags %u access_mask %u } mask_matched %u", + __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, + __entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz, + __entry->z_seq, __entry->z_mapcnt, __entry->z_gen, + __entry->z_size, __entry->z_atime[0], __entry->z_atime[1], + __entry->z_links, __entry->z_pflags, __entry->z_uid, + __entry->z_gid, __entry->z_sync_cnt, __entry->z_mode, + __entry->z_is_sa, __entry->z_is_zvol, __entry->z_is_mapped, + __entry->z_is_ctldir, __entry->z_is_stale, __entry->i_ino, + __entry->i_nlink, __entry->i_version, __entry->i_size, + __entry->i_blkbits, __entry->i_bytes, __entry->i_mode, + __entry->i_generation, __entry->z_type, __entry->z_flags, + __entry->z_access_mask, __entry->mask_matched) +); + +#define DEFINE_ACE_EVENT(name) \ +DEFINE_EVENT(zfs_ace_class, name, \ + TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), \ + TP_ARGS(zn, ace, mask_matched)) +DEFINE_ACE_EVENT(zfs_zfs__ace__denies); +DEFINE_ACE_EVENT(zfs_zfs__ace__allows); + +/* + * Generic support for one argument tracepoints of the form: + * + * DTRACE_PROBE1(..., + * zilog_t *, ...); + */ +DECLARE_EVENT_CLASS(zfs_zil_class, + TP_PROTO(zilog_t *zilog), + TP_ARGS(zilog), + TP_STRUCT__entry( + __field(uint64_t, zl_lr_seq) + __field(uint64_t, zl_commit_lr_seq) + __field(uint64_t, zl_destroy_txg) + __field(uint64_t, zl_replaying_seq) + __field(uint32_t, zl_suspend) + __field(uint8_t, zl_suspending) + __field(uint8_t, zl_keep_first) + __field(uint8_t, zl_replay) + __field(uint8_t, 
zl_stop_sync) + __field(uint8_t, zl_writer) + __field(uint8_t, zl_logbias) + __field(uint8_t, zl_sync) + __field(int, zl_parse_error) + __field(uint64_t, zl_parse_blk_seq) + __field(uint64_t, zl_parse_lr_seq) + __field(uint64_t, zl_parse_blk_count) + __field(uint64_t, zl_parse_lr_count) + __field(uint64_t, zl_next_batch) + __field(uint64_t, zl_com_batch) + __field(uint64_t, zl_itx_list_sz) + __field(uint64_t, zl_cur_used) + __field(clock_t, zl_replay_time) + __field(uint64_t, zl_replay_blks) + ), + TP_fast_assign( + __entry->zl_lr_seq = zilog->zl_lr_seq; + __entry->zl_commit_lr_seq = zilog->zl_commit_lr_seq; + __entry->zl_destroy_txg = zilog->zl_destroy_txg; + __entry->zl_replaying_seq = zilog->zl_replaying_seq; + __entry->zl_suspend = zilog->zl_suspend; + __entry->zl_suspending = zilog->zl_suspending; + __entry->zl_keep_first = zilog->zl_keep_first; + __entry->zl_replay = zilog->zl_replay; + __entry->zl_stop_sync = zilog->zl_stop_sync; + __entry->zl_writer = zilog->zl_writer; + __entry->zl_logbias = zilog->zl_logbias; + __entry->zl_sync = zilog->zl_sync; + __entry->zl_parse_error = zilog->zl_parse_error; + __entry->zl_parse_blk_seq = zilog->zl_parse_blk_seq; + __entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; + __entry->zl_parse_blk_count = zilog->zl_parse_blk_count; + __entry->zl_parse_lr_count = zilog->zl_parse_lr_count; + __entry->zl_next_batch = zilog->zl_next_batch; + __entry->zl_com_batch = zilog->zl_com_batch; + __entry->zl_itx_list_sz = zilog->zl_itx_list_sz; + __entry->zl_cur_used = zilog->zl_cur_used; + __entry->zl_replay_time = zilog->zl_replay_time; + __entry->zl_replay_blks = zilog->zl_replay_blks; + ), + TP_printk("zl { lr_seq %llu commit_lr_seq %llu destroy_txg %llu " + "replaying_seq %llu suspend %u suspending %u keep_first %u " + "replay %u stop_sync %u writer %u logbias %u sync %u " + "parse_error %u parse_blk_seq %llu parse_lr_seq %llu " + "parse_blk_count %llu parse_lr_count %llu next_batch %llu " + "com_batch %llu itx_list_sz %llu cur_used 
%llu replay_time %lu " + "replay_blks %llu }", + __entry->zl_lr_seq, __entry->zl_commit_lr_seq, + __entry->zl_destroy_txg, __entry->zl_replaying_seq, + __entry->zl_suspend, __entry->zl_suspending, __entry->zl_keep_first, + __entry->zl_replay, __entry->zl_stop_sync, __entry->zl_writer, + __entry->zl_logbias, __entry->zl_sync, __entry->zl_parse_error, + __entry->zl_parse_blk_seq, __entry->zl_parse_lr_seq, + __entry->zl_parse_blk_count, __entry->zl_parse_lr_count, + __entry->zl_next_batch, __entry->zl_com_batch, + __entry->zl_itx_list_sz, __entry->zl_cur_used, + __entry->zl_replay_time, __entry->zl_replay_blks) +); + +#define DEFINE_ZIL_EVENT(name) \ +DEFINE_EVENT(zfs_zil_class, name, \ + TP_PROTO(zilog_t *zilog), \ + TP_ARGS(zilog)) +DEFINE_ZIL_EVENT(zfs_zil__cw1); +DEFINE_ZIL_EVENT(zfs_zil__cw2); + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * dmu_buf_impl_t *, ..., + * zio_t *, ...); + */ +#define DBUF_TP_STRUCT_ENTRY \ + __field(const char *, os_spa) \ + __field(uint64_t, ds_object) \ + __field(uint64_t, db_object) \ + __field(uint64_t, db_level) \ + __field(uint64_t, db_blkid) \ + __field(uint64_t, db_offset) \ + __field(uint64_t, db_size) \ + __field(uint64_t, db_state) \ + __field(int64_t, db_holds) \ + +#define DBUF_TP_FAST_ASSIGN \ + __entry->os_spa = \ + spa_name(DB_DNODE(db)->dn_objset->os_spa); \ + \ + __entry->ds_object = db->db_objset->os_dsl_dataset ? 
\ + db->db_objset->os_dsl_dataset->ds_object : 0; \ + \ + __entry->db_object = db->db.db_object; \ + __entry->db_level = db->db_level; \ + __entry->db_blkid = db->db_blkid; \ + __entry->db_offset = db->db.db_offset; \ + __entry->db_size = db->db.db_size; \ + __entry->db_state = db->db_state; \ + __entry->db_holds = refcount_count(&db->db_holds); + +#define DBUF_TP_PRINTK_FMT \ + "dbuf { spa \"%s\" objset %llu object %llu level %llu " \ + "blkid %llu offset %llu size %llu state %llu holds %lld }" + +#define DBUF_TP_PRINTK_ARGS \ + __entry->os_spa, __entry->ds_object, \ + __entry->db_object, __entry->db_level, \ + __entry->db_blkid, __entry->db_offset, \ + __entry->db_size, __entry->db_state, __entry->db_holds + +DECLARE_EVENT_CLASS(zfs_dbuf_class, + TP_PROTO(dmu_buf_impl_t *db, zio_t *zio), + TP_ARGS(db, zio), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk(DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS) +); + +#define DEFINE_DBUF_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, zio_t *zio), \ + TP_ARGS(db, zio)) +DEFINE_DBUF_EVENT(zfs_blocked__read); + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * zrlock_t *, ..., + * uint32_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_zrlock_class, + TP_PROTO(zrlock_t *zrl, uint32_t n), + TP_ARGS(zrl, n), + TP_STRUCT__entry( + __field(int32_t, zr_refcount) +#ifdef ZFS_DEBUG + __field(pid_t, zr_owner_pid) + __field(const char *, zr_caller) +#endif + __field(uint32_t, n) + ), + TP_fast_assign( + __entry->zr_refcount = zrl->zr_refcount; +#ifdef ZFS_DEBUG + __entry->zr_owner_pid = zrl->zr_owner ? zrl->zr_owner->pid : 0; + __entry->zr_caller = zrl->zr_caller; +#endif + __entry->n = n; + ), +#ifdef ZFS_DEBUG + TP_printk("zrl { refcount %d owner_pid %d caller %s } n %u", + __entry->zr_refcount, __entry->zr_owner_pid, __entry->zr_caller, + __entry->n) +#else + TP_printk("zrl { refcount %d } n %u", + __entry->zr_refcount, __entry->n) +#endif 
+); + +#define DEFINE_ZRLOCK_EVENT(name) \ +DEFINE_EVENT(zfs_zrlock_class, name, \ + TP_PROTO(zrlock_t *zrl, uint32_t n), \ + TP_ARGS(zrl, n)) +DEFINE_ZRLOCK_EVENT(zfs_zrlock__reentry); + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * const char *, ..., + * const char *, ..., + * int, ..., + * uintptr_t, ...); + */ +DECLARE_EVENT_CLASS(zfs_set_error_class, + TP_PROTO(const char *file, const char *function, int line, + uintptr_t error), + TP_ARGS(file, function, line, error), + TP_STRUCT__entry( + __field(const char *, file) + __field(const char *, function) + __field(int, line) + __field(uintptr_t, error) + ), + TP_fast_assign( + __entry->file = strchr(file, '/') ? strrchr(file, '/') + 1 : file; + __entry->function = function; + __entry->line = line; + __entry->error = error; + ), + TP_printk("%s:%d:%s(): error 0x%lx", __entry->file, __entry->line, + __entry->function, __entry->error) +); + +#define DEFINE_SET_ERROR_EVENT(name) \ +DEFINE_EVENT(zfs_set_error_class, name, \ + TP_PROTO(const char *file, const char *function, int line, \ + uintptr_t error), \ + TP_ARGS(file, function, line, error)) +DEFINE_SET_ERROR_EVENT(zfs_set__error); + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * const char *, ..., + * const char *, ..., + * int, ..., + * const char *, ...); + */ +DECLARE_EVENT_CLASS(zfs_dprintf_class, + TP_PROTO(const char *file, const char *function, int line, + const char *msg), + TP_ARGS(file, function, line, msg), + TP_STRUCT__entry( + __field(const char *, file) + __field(const char *, function) + __field(int, line) + __string(msg, msg) + ), + TP_fast_assign( + __entry->file = file; + __entry->function = function; + __entry->line = line; + __assign_str(msg, msg); + ), + TP_printk("%s:%d:%s(): %s", __entry->file, __entry->line, + __entry->function, __get_str(msg)) +); + +#define DEFINE_DPRINTF_EVENT(name) \ +DEFINE_EVENT(zfs_dprintf_class, name, \ + 
TP_PROTO(const char *file, const char *function, int line, \ + const char *msg), \ + TP_ARGS(file, function, line, msg)) +DEFINE_DPRINTF_EVENT(zfs_zfs__dprintf); + +/* + * Generic support for one argument tracepoints of the form: + * + * DTRACE_PROBE1(..., + * const char *, ...); + */ +DECLARE_EVENT_CLASS(zfs_dbgmsg_class, + TP_PROTO(const char *msg), + TP_ARGS(msg), + TP_STRUCT__entry( + __string(msg, msg) + ), + TP_fast_assign( + __assign_str(msg, msg); + ), + TP_printk("%s", __get_str(msg)) +); + +#define DEFINE_DBGMSG_EVENT(name) \ +DEFINE_EVENT(zfs_dbgmsg_class, name, \ + TP_PROTO(const char *msg), \ + TP_ARGS(msg)) +DEFINE_DBGMSG_EVENT(zfs_zfs__dbgmsg); + +#endif /* _TRACE_ZFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace +#include + +#endif /* _KERNEL && HAVE_DECLARE_EVENT_CLASS */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index ee8221d534dd..d4c6fb810b5b 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -140,15 +141,12 @@ #define CE_PANIC 3 /* panic */ #define CE_IGNORE 4 /* print nothing */ -extern int aok; - /* * ZFS debugging */ extern void dprintf_setup(int *argc, char **argv); -extern void __dprintf(const char *file, const char *func, - int line, const char *fmt, ...); + extern void cmn_err(int, const char *, ...); extern void vcmn_err(int, const char *, va_list); extern void panic(const char *, ...); @@ -156,7 +154,8 @@ extern void vpanic(const char *, va_list); #define fm_panic panic -#ifdef __sun +extern int aok; + /* * DTrace SDT probes have different signatures in userland than they do in * kernel. If they're being used in kernel code, re-define them out of @@ -202,9 +201,6 @@ extern void vpanic(const char *, va_list); * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). 
*/ #define SET_ERROR(err) (ZFS_SET_ERROR(err), err) -#else -#define SET_ERROR(err) (err) -#endif /* * Threads. TS_STACK_MIN is dictated by the minimum allowed pthread stack diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index 35ffa0187c79..1a7062408e04 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -38,14 +38,6 @@ extern "C" { #define FALSE 0 #endif -/* - * ZFS debugging - Always enabled for user space builds. - */ - -#if !defined(ZFS_DEBUG) && !defined(_KERNEL) -#define ZFS_DEBUG -#endif - extern int zfs_flags; extern int zfs_recover; extern int zfs_free_leak_on_eio; @@ -59,29 +51,15 @@ extern int zfs_free_leak_on_eio; #define ZFS_DEBUG_ZIO_FREE (1<<6) #define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7) -/* - * Always log zfs debug messages to the spl debug subsystem as SS_USER1. - * When the SPL is configured with debugging enabled these messages will - * appear in the internal spl debug log, otherwise they are a no-op. - */ -#if defined(_KERNEL) - -#include -#define dprintf(...) \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ - __SDEBUG(NULL, SS_USER1, SD_DPRINTF, __VA_ARGS__) - -/* - * When zfs is running is user space the debugging is always enabled. - * The messages will be printed using the __dprintf() function and - * filtered based on the zfs_flags variable. - */ -#else -#define dprintf(...) \ - if (zfs_flags & ZFS_DEBUG_DPRINTF) \ +#if defined(HAVE_DECLARE_EVENT_CLASS) || !defined(_KERNEL) +extern void __dprintf(const char *file, const char *func, + int line, const char *fmt, ...); +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ __dprintf(__FILE__, __func__, __LINE__, __VA_ARGS__) - -#endif /* _KERNEL */ +#else +#define dprintf(...) 
((void)0) +#endif /* HAVE_DECLARE_EVENT_CLASS || !_KERNEL */ extern void zfs_panic_recover(const char *fmt, ...); @@ -93,12 +71,8 @@ typedef struct zfs_dbgmsg { extern void zfs_dbgmsg_init(void); extern void zfs_dbgmsg_fini(void); -#if defined(_KERNEL) && defined(__linux__) -#define zfs_dbgmsg(...) dprintf(__VA_ARGS__) -#else extern void zfs_dbgmsg(const char *fmt, ...); extern void zfs_dbgmsg_print(const char *tag); -#endif #ifndef _KERNEL extern int dprintf_find_string(const char *string); diff --git a/lib/libspl/include/sys/Makefile.am b/lib/libspl/include/sys/Makefile.am index d86cc6a5601a..8545f54ea180 100644 --- a/lib/libspl/include/sys/Makefile.am +++ b/lib/libspl/include/sys/Makefile.am @@ -33,7 +33,6 @@ libspl_HEADERS = \ $(top_srcdir)/lib/libspl/include/sys/param.h \ $(top_srcdir)/lib/libspl/include/sys/priv.h \ $(top_srcdir)/lib/libspl/include/sys/processor.h \ - $(top_srcdir)/lib/libspl/include/sys/sdt.h \ $(top_srcdir)/lib/libspl/include/sys/stack.h \ $(top_srcdir)/lib/libspl/include/sys/stat.h \ $(top_srcdir)/lib/libspl/include/sys/stropts.h \ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index f4838da75fd0..85bc0510a81d 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -70,6 +70,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/space_map.c \ $(top_srcdir)/module/zfs/space_reftree.c \ $(top_srcdir)/module/zfs/txg.c \ + $(top_srcdir)/module/zfs/trace.c \ $(top_srcdir)/module/zfs/uberblock.c \ $(top_srcdir)/module/zfs/unique.c \ $(top_srcdir)/module/zfs/vdev.c \ diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 48e7e97e9814..954841f33137 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -52,6 +52,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o $(MODULE)-objs += @top_srcdir@/module/zfs/space_reftree.o $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o +$(MODULE)-objs += @top_srcdir@/module/zfs/trace.o 
$(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o $(MODULE)-objs += @top_srcdir@/module/zfs/unique.o $(MODULE)-objs += @top_srcdir@/module/zfs/vdev.o diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0742da0f6f40..fd593a13e8ef 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -145,6 +145,7 @@ #include #include #include +#include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -218,46 +219,6 @@ unsigned long zfs_arc_max = 0; unsigned long zfs_arc_min = 0; unsigned long zfs_arc_meta_limit = 0; -/* - * Note that buffers can be in one of 6 states: - * ARC_anon - anonymous (discussed below) - * ARC_mru - recently used, currently cached - * ARC_mru_ghost - recentely used, no longer in cache - * ARC_mfu - frequently used, currently cached - * ARC_mfu_ghost - frequently used, no longer in cache - * ARC_l2c_only - exists in L2ARC but not other states - * When there are no active references to the buffer, they are - * are linked onto a list in one of these arc states. These are - * the only buffers that can be evicted or deleted. Within each - * state there are multiple lists, one for meta-data and one for - * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, - * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitly. - * - * Anonymous buffers are buffers that are not associated with - * a DVA. These are buffers that hold dirty block copies - * before they are written to stable storage. By definition, - * they are "ref'd" and are considered part of arc_mru - * that cannot be freed. Generally, they will aquire a DVA - * as they are written and migrate onto the arc_mru list. - * - * The ARC_l2c_only state is for buffers that are in the second - * level ARC but no longer in any of the ARC_m* lists. The second - * level ARC itself may also contain buffers that are in any of - * the ARC_m* states - meaning that a buffer can exist in two - * places. 
The reason for the ARC_l2c_only state is to keep the - * buffer header in the hash table, so that reads that hit the - * second level ARC benefit from these fast lookups. - */ - -typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ - uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; - arc_state_type_t arcs_state; -} arc_state_t; - /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -522,69 +483,6 @@ static arc_state_t *arc_l2c_only; #define L2ARC_IS_VALID_COMPRESS(_c_) \ ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - -typedef struct arc_callback arc_callback_t; - -struct arc_callback { - void *acb_private; - arc_done_func_t *acb_done; - arc_buf_t *acb_buf; - zio_t *acb_zio_dummy; - arc_callback_t *acb_next; -}; - -typedef struct arc_write_callback arc_write_callback_t; - -struct arc_write_callback { - void *awcb_private; - arc_done_func_t *awcb_ready; - arc_done_func_t *awcb_physdone; - arc_done_func_t *awcb_done; - arc_buf_t *awcb_buf; -}; - -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - - kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - - arc_buf_hdr_t *b_hash_next; - arc_buf_t *b_buf; - uint32_t b_flags; - uint32_t b_datacnt; - - arc_callback_t *b_acb; - kcondvar_t b_cv; - - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; - - /* protected by arc state mutex */ - arc_state_t *b_state; - list_node_t b_arc_node; - - /* updated atomically */ - clock_t b_arc_access; - uint32_t b_mru_hits; - uint32_t b_mru_ghost_hits; - uint32_t b_mfu_hits; - uint32_t b_mfu_ghost_hits; - uint32_t b_l2_hits; - - /* self protecting */ - refcount_t b_refcnt; - - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; - static list_t arc_prune_list; 
static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; @@ -707,19 +605,6 @@ int l2arc_norw = B_FALSE; /* no reads during writes */ /* * L2ARC Internals */ -typedef struct l2arc_dev { - vdev_t *l2ad_vdev; /* vdev */ - spa_t *l2ad_spa; /* spa */ - uint64_t l2ad_hand; /* next write location */ - uint64_t l2ad_start; /* first addr on device */ - uint64_t l2ad_end; /* last addr on device */ - uint64_t l2ad_evict; /* last addr eviction reached */ - boolean_t l2ad_first; /* first sweep through */ - boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ - list_node_t l2ad_node; /* device list node */ -} l2arc_dev_t; - static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ @@ -2043,7 +1928,7 @@ arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, if (bytes_evicted < bytes) dprintf("only evicted %lld bytes from %x\n", - (longlong_t)bytes_evicted, state); + (longlong_t)bytes_evicted, state->arcs_state); if (skipped) ARCSTAT_INCR(arcstat_evict_skip, skipped); diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c index 61fa17f6ee9f..0eafa4d7d8ae 100644 --- a/module/zfs/bptree.c +++ b/module/zfs/bptree.c @@ -221,7 +221,7 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, if (zfs_free_leak_on_eio) flags |= TRAVERSE_HARD; - zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld " + zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " "bookmark %lld/%lld/%lld/%lld", i, (longlong_t)bte.be_birth_txg, (longlong_t)bte.be_zb.zb_objset, diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index bb740e569544..2d16d7e06e6b 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -238,6 +238,53 @@ static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode_global; +#ifdef ZFS_DEBUG +/* Everything except dprintf and spa is on by default in 
debug builds */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); +#else +int zfs_flags = 0; +#endif + +/* + * zfs_recover can be set to nonzero to attempt to recover from + * otherwise-fatal errors, typically caused by on-disk corruption. When + * set, calls to zfs_panic_recover() will turn into warning messages. + * This should only be used as a last resort, as it typically results + * in leaked space, or worse. + */ +int zfs_recover = B_FALSE; + +/* + * If destroy encounters an EIO while reading metadata (e.g. indirect + * blocks), space referenced by the missing metadata can not be freed. + * Normally this causes the background destroy to become "stalled", as + * it is unable to make forward progress. While in this stalled state, + * all remaining space to free from the error-encountering filesystem is + * "temporarily leaked". Set this flag to cause it to ignore the EIO, + * permanently leak the space from indirect blocks that can not be read, + * and continue to free everything else that it can. + * + * The default, "stalling" behavior is useful if the storage partially + * fails (i.e. some but not all i/os fail), and then later recovers. In + * this case, we will be able to continue pool operations while it is + * partially failed, and when it recovers, we can continue to free the + * space, with no leaks. However, note that this case is actually + * fairly rare. + * + * Typically pools either (a) fail completely (but perhaps temporarily, + * e.g. a top-level vdev going offline), or (b) have localized, + * permanent errors (e.g. disk returns the wrong data due to bit flip or + * firmware bug). In case (a), this setting does not matter because the + * pool will be suspended and the sync thread will not be able to make + * forward progress regardless. In case (b), because the error is + * permanent, the best we can do is leak the minimum amount of space, + * which is what setting this flag will do. 
Therefore, it is reasonable + * for this flag to normally be set, but we chose the more conservative + * approach of not setting it, so that there is no possibility of + * leaking space in the "partial temporary" failure case. + */ +int zfs_free_leak_on_eio = B_FALSE; + /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the @@ -1319,6 +1366,16 @@ spa_freeze(spa_t *spa) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } +void +zfs_panic_recover(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); + va_end(adx); +} + /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. @@ -1923,6 +1980,16 @@ EXPORT_SYMBOL(spa_mode); EXPORT_SYMBOL(spa_namespace_lock); +module_param(zfs_flags, int, 0644); +MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); + +module_param(zfs_recover, int, 0644); +MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); + +module_param(zfs_free_leak_on_eio, int, 0644); +MODULE_PARM_DESC(zfs_free_leak_on_eio, + "Set to ignore IO errors during free and permanently leak the space"); + module_param(zfs_deadman_synctime_ms, ulong, 0644); MODULE_PARM_DESC(zfs_deadman_synctime_ms, "Expiration time in milliseconds"); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 0ceccf669618..fc0df756e6aa 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -427,7 +427,7 @@ space_map_truncate(space_map_t *sm, dmu_tx_t *tx) doi.doi_bonus_size != sizeof (space_map_phys_t)) || doi.doi_data_block_size != space_map_blksz) { zfs_dbgmsg("txg %llu, spa %s, reallocating: " - "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx), + "old bonus %llu, old blocksz %u", dmu_tx_get_txg(tx), spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size); space_map_free(sm, tx); diff --git 
a/lib/libspl/include/sys/sdt.h b/module/zfs/trace.c similarity index 53% rename from lib/libspl/include/sys/sdt.h rename to module/zfs/trace.c index f68f79040b24..54ca2b2b492b 100644 --- a/lib/libspl/include/sys/sdt.h +++ b/module/zfs/trace.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,17 +19,21 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Each Linux tracepoints subsystem must define CREATE_TRACE_POINTS in one + * (and only one) C file, so this dummy file exists for that purpose. 
*/ -#ifndef _LIBSPL_SYS_SDT_H -#define _LIBSPL_SYS_SDT_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#define DTRACE_PROBE(a) ((void) 0) -#define DTRACE_PROBE1(a, b, c) ((void) 0) -#define DTRACE_PROBE2(a, b, c, d, e) ((void) 0) -#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void) 0) -#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void) 0) - -#endif +#define CREATE_TRACE_POINTS +#include diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 02c92748f6c0..4ee75b6d9229 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -209,7 +209,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, tbl->zt_nextblk = 0; tbl->zt_blks_copied = 0; - dprintf("finished; numblocks now %llu (%lluk entries)\n", + dprintf("finished; numblocks now %llu (%uk entries)\n", tbl->zt_numblks, 1<<(tbl->zt_shift-10)); } diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c index e7f266b5fab8..e1675c818a7d 100644 --- a/module/zfs/zfs_debug.c +++ b/module/zfs/zfs_debug.c @@ -25,99 +25,22 @@ #include -#if !defined(_KERNEL) || !defined(__linux__) list_t zfs_dbgmsgs; int zfs_dbgmsg_size; kmutex_t zfs_dbgmsgs_lock; int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ -#endif -/* - * Enable various debugging features. - */ -int zfs_flags = 0; - -/* - * zfs_recover can be set to nonzero to attempt to recover from - * otherwise-fatal errors, typically caused by on-disk corruption. When - * set, calls to zfs_panic_recover() will turn into warning messages. - * This should only be used as a last resort, as it typically results - * in leaked space, or worse. - */ -int zfs_recover = B_FALSE; - -/* - * If destroy encounters an EIO while reading metadata (e.g. indirect - * blocks), space referenced by the missing metadata can not be freed. - * Normally this causes the background destroy to become "stalled", as - * it is unable to make forward progress. 
While in this stalled state, - * all remaining space to free from the error-encountering filesystem is - * "temporarily leaked". Set this flag to cause it to ignore the EIO, - * permanently leak the space from indirect blocks that can not be read, - * and continue to free everything else that it can. - * - * The default, "stalling" behavior is useful if the storage partially - * fails (i.e. some but not all i/os fail), and then later recovers. In - * this case, we will be able to continue pool operations while it is - * partially failed, and when it recovers, we can continue to free the - * space, with no leaks. However, note that this case is actually - * fairly rare. - * - * Typically pools either (a) fail completely (but perhaps temporarily, - * e.g. a top-level vdev going offline), or (b) have localized, - * permanent errors (e.g. disk returns the wrong data due to bit flip or - * firmware bug). In case (a), this setting does not matter because the - * pool will be suspended and the sync thread will not be able to make - * forward progress regardless. In case (b), because the error is - * permanent, the best we can do is leak the minimum amount of space, - * which is what setting this flag will do. Therefore, it is reasonable - * for this flag to normally be set, but we chose the more conservative - * approach of not setting it, so that there is no possibility of - * leaking space in the "partial temporary" failure case. - */ -int zfs_free_leak_on_eio = B_FALSE; - - -void -zfs_panic_recover(const char *fmt, ...) -{ - va_list adx; - - va_start(adx, fmt); - vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); - va_end(adx); -} - -/* - * Debug logging is enabled by default for production kernel builds. - * The overhead for this is negligible and the logs can be valuable when - * debugging. For non-production user space builds all debugging except - * logging is enabled since performance is no longer a concern. 
- */ void zfs_dbgmsg_init(void) { -#if !defined(_KERNEL) || !defined(__linux__) list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), offsetof(zfs_dbgmsg_t, zdm_node)); mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); -#endif - - if (zfs_flags == 0) { -#if defined(_KERNEL) - zfs_flags = ZFS_DEBUG_DPRINTF; - spl_debug_set_mask(spl_debug_get_mask() | SD_DPRINTF); - spl_debug_set_subsys(spl_debug_get_subsys() | SS_USER1); -#else - zfs_flags = ~ZFS_DEBUG_DPRINTF; -#endif /* _KERNEL */ - } } void zfs_dbgmsg_fini(void) { -#if !defined(_KERNEL) || !defined(__linux__) zfs_dbgmsg_t *zdm; while ((zdm = list_remove_head(&zfs_dbgmsgs)) != NULL) { @@ -127,25 +50,24 @@ zfs_dbgmsg_fini(void) } mutex_destroy(&zfs_dbgmsgs_lock); ASSERT0(zfs_dbgmsg_size); -#endif } -#if !defined(_KERNEL) || !defined(__linux__) /* - * Print these messages by running: - * echo ::zfs_dbgmsg | mdb -k + * To get this data enable the zfs__dbgmsg tracepoint as shown: * - * Monitor these messages by running: - * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * # Enable zfs__dbgmsg tracepoint, clear the tracepoint ring buffer + * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable + * $ echo 0 > /sys/kernel/debug/tracing/trace * - * When used with libzpool, monitor with: - * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' + * # Dump the ring buffer. + * $ cat /sys/kernel/debug/tracing/trace */ void zfs_dbgmsg(const char *fmt, ...) { int size; va_list adx; + char *nl; zfs_dbgmsg_t *zdm; va_start(adx, fmt); @@ -156,13 +78,20 @@ zfs_dbgmsg(const char *fmt, ...) * There is one byte of string in sizeof (zfs_dbgmsg_t), used * for the terminating null. */ - zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_SLEEP); + zdm = kmem_alloc(sizeof (zfs_dbgmsg_t) + size, KM_PUSHPAGE); zdm->zdm_timestamp = gethrestime_sec(); va_start(adx, fmt); (void) vsnprintf(zdm->zdm_msg, size + 1, fmt, adx); va_end(adx); + /* + * Get rid of trailing newline. 
+ */ + nl = strrchr(zdm->zdm_msg, '\n'); + if (nl != NULL) + *nl = '\0'; + DTRACE_PROBE1(zfs__dbgmsg, char *, zdm->zdm_msg); mutex_enter(&zfs_dbgmsgs_lock); @@ -180,6 +109,7 @@ zfs_dbgmsg(const char *fmt, ...) void zfs_dbgmsg_print(const char *tag) { +#if !defined(_KERNEL) zfs_dbgmsg_t *zdm; (void) printf("ZFS_DBGMSG(%s):\n", tag); @@ -188,17 +118,5 @@ zfs_dbgmsg_print(const char *tag) zdm = list_next(&zfs_dbgmsgs, zdm)) (void) printf("%s\n", zdm->zdm_msg); mutex_exit(&zfs_dbgmsgs_lock); +#endif /* !_KERNEL */ } -#endif - -#if defined(_KERNEL) -module_param(zfs_flags, int, 0644); -MODULE_PARM_DESC(zfs_flags, "Set additional debugging flags"); - -module_param(zfs_recover, int, 0644); -MODULE_PARM_DESC(zfs_recover, "Set to attempt to recover from fatal errors"); - -module_param(zfs_free_leak_on_eio, int, 0644); -MODULE_PARM_DESC(zfs_free_leak_on_eio, - "Set to ignore IO errors during free and permanently leak the space"); -#endif /* _KERNEL */ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 37a893c475d8..9396d6caa9ba 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -247,6 +247,55 @@ static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); +#if defined(HAVE_DECLARE_EVENT_CLASS) +void +__dprintf(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + size_t size = 4096; + char *buf = kmem_alloc(size, KM_PUSHPAGE); + char *nl; + va_list adx; + + /* + * Get rid of annoying prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + va_start(adx, fmt); + (void) vsnprintf(buf, size, fmt, adx); + va_end(adx); + + /* + * Get rid of trailing newline. 
+ */ + nl = strrchr(buf, '\n'); + if (nl != NULL) + *nl = '\0'; + + /* + * To get this data enable the zfs__dprintf trace point as shown: + * + * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer + * $ echo 1 > /sys/module/zfs/parameters/zfs_flags + * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable + * $ echo 0 > /sys/kernel/debug/tracing/trace + * + * # Dump the ring buffer. + * $ cat /sys/kernel/debug/tracing/trace + */ + DTRACE_PROBE4(zfs__dprintf, + char *, newfile, char *, func, int, line, char *, buf); + + kmem_free(buf, size); +} +#endif /* HAVE_DECLARE_EVENT_CLASS */ + static void history_str_free(char *buf) { diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index eea78417a5ba..d67f11eca08b 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1436,7 +1436,7 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) gen_mask = -1ULL >> (64 - 8 * i); - dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zsb, object, &zp))) { ZFS_EXIT(zsb); return (err); @@ -1447,7 +1447,8 @@ zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { - dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, + fid_gen); iput(ZTOI(zp)); ZFS_EXIT(zsb); return (SET_ERROR(EINVAL)); diff --git a/scripts/cstyle.pl b/scripts/cstyle.pl index 083b30f6e8e9..4df185eb38ce 100755 --- a/scripts/cstyle.pl +++ b/scripts/cstyle.pl @@ -597,8 +597,9 @@ ($$) if (/\(\s/) { err("whitespace after left paren"); } - # allow "for" statements to have empty "continue" clauses - if (/\s\)/ && !/^\s*for \([^;]*;[^;]*; \)/) { + # Allow "for" statements to have empty "continue" clauses. + # Allow right paren on its own line unless we're being picky (-p). 
+ if (/\s\)/ && !/^\s*for \([^;]*;[^;]*; \)/ && ($picky || !/^\s*\)/)) { err("whitespace before right paren"); } if (/^\s*\(void\)[^ ]/) { diff --git a/scripts/zfs.sh b/scripts/zfs.sh index b97a0577ff8a..55584ddd1f91 100755 --- a/scripts/zfs.sh +++ b/scripts/zfs.sh @@ -33,7 +33,6 @@ MODULE-OPTIONS: $0 zfs="zfs_prefetch_disable=1" $0 zfs="zfs_prefetch_disable=1 zfs_mdcomp_disable=1" -$0 spl="spl_debug_mask=0" EOF }