diff --git a/lib/deflate_compress.c b/lib/deflate_compress.c
index 800398df..b9fe042f 100644
--- a/lib/deflate_compress.c
+++ b/lib/deflate_compress.c
@@ -31,84 +31,127 @@
 
 #include "libdeflate.h"
 
+/******************************************************************************/
+
 /*
- * By default, the near-optimal parsing algorithm is enabled at compression
- * level 10 and above.  The near-optimal parsing algorithm produces a
- * compression ratio significantly better than the greedy and lazy algorithms
- * implemented here, and also the algorithm used by zlib at level 9.  However,
- * it is slow.
+ * The following parameters can be changed at build time to customize the
+ * compression algorithms slightly:
+ *
+ * (Note, not all customizable parameters are here.  Some others can be found in
+ * libdeflate_alloc_compressor() and in *_matchfinder.h.)
  */
-#define SUPPORT_NEAR_OPTIMAL_PARSING 1
 
 /*
- * Define to 1 to maintain the full map from match offsets to offset slots.
- * This slightly speeds up translations of match offsets to offset slots, but it
- * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
- * map.  The speedup provided by the larger map is most helpful when the
- * near-optimal parsing algorithm is being used.
+ * If this parameter is defined to 1, then the near-optimal parsing algorithm
+ * will be included, and compression levels 10-12 will use it.  This algorithm
+ * usually produces a compression ratio significantly better than the other
+ * algorithms.  However, it is slow.  If this parameter is defined to 0, then
+ * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
  */
-#define USE_FULL_OFFSET_SLOT_FAST	SUPPORT_NEAR_OPTIMAL_PARSING
+#define SUPPORT_NEAR_OPTIMAL_PARSING	1
 
-/* Include the needed matchfinders. */
-#define MATCHFINDER_WINDOW_ORDER	DEFLATE_WINDOW_ORDER
-#include "hc_matchfinder.h"
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-#  include "bt_matchfinder.h"
-#endif
+/*
+ * If this parameter is defined to 1, then the compressor will maintain a full
+ * map from match offsets to offset slots, rather than a condensed map.  This
+ * will usually improve performance, especially for the near-optimal parsing
+ * algorithm.  However, it will use an additional 32257 bytes of memory.
+ */
+#define USE_FULL_OFFSET_SLOT_FAST	SUPPORT_NEAR_OPTIMAL_PARSING
 
 /*
- * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
- * except if the last block has to be shorter.
+ * This is the minimum block length, in uncompressed bytes, which the compressor
+ * will use.  This should be a value below which using shorter blocks is very
+ * unlikely to be worthwhile, due to the per-block overhead.  This parameter
+ * doesn't apply to the final block, which can be arbitrarily short.
+ *
+ * Defining a fixed minimum block length is needed in order to guarantee a
+ * reasonable upper bound on the compressed size.  It's also needed because our
+ * block splitting algorithm doesn't work well on very short blocks.
  */
 #define MIN_BLOCK_LENGTH	10000
 
 /*
- * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
- * the final length might be slightly longer due to matches extending beyond
- * this limit.
+ * This is the soft maximum block length, in uncompressed bytes, which the
+ * compressor will use.  This is a "soft" maximum, meaning that the compressor
+ * will try to end blocks at this length, but it may go slightly past it if
+ * there is a match that straddles this limit.  This parameter doesn't apply to
+ * uncompressed blocks, which the DEFLATE format limits to 65535 bytes.
+ *
+ * This should be a value above which it is very likely that splitting the block
+ * would produce a better compression ratio.  Increasing/decreasing this
+ * parameter will increase/decrease per-compressor memory usage linearly.
  */
 #define SOFT_MAX_BLOCK_LENGTH	300000
 
 /*
- * The number of observed matches or literals that represents sufficient data to
- * decide whether the current block should be terminated or not.
+ * These are the maximum codeword lengths, in bits, the compressor will use for
+ * each Huffman code.  The DEFLATE format defines limits for these.  However,
+ * further limiting litlen codewords to 14 bits is beneficial, since it has
+ * negligible effect on compression ratio but allows some optimizations when
+ * outputting bits.  (It allows 4 literals to be written at once rather than 3.)
  */
-#define NUM_OBSERVATIONS_PER_BLOCK_CHECK       512
-
+#define MAX_LITLEN_CODEWORD_LEN		14
+#define MAX_OFFSET_CODEWORD_LEN		DEFLATE_MAX_OFFSET_CODEWORD_LEN
+#define MAX_PRE_CODEWORD_LEN		DEFLATE_MAX_PRE_CODEWORD_LEN
 
 #if SUPPORT_NEAR_OPTIMAL_PARSING
-/* Constants specific to the near-optimal parsing algorithm */
+
+/* Parameters specific to the near-optimal parsing algorithm */
+
+/*
+ * BIT_COST is a scaling factor that allows the compressor to consider
+ * fractional bit costs when deciding which literal/match sequence to use.  This
+ * is useful when the true symbol costs are unknown.  For example, if the
+ * compressor thinks that a symbol has 6.5 bits of entropy, it can set its cost
+ * to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end each
+ * symbol will use a whole number of bits due to the Huffman coding, considering
+ * fractional bits can be helpful due to the limited information.
+ *
+ * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
+ * value isn't very useful since the calculations are approximate anyway.
+ */
+#define BIT_COST	8
 
 /*
- * The maximum number of matches the matchfinder can find at a single position.
- * Since the matchfinder never finds more than one match for the same length,
- * presuming one of each possible length is sufficient for an upper bound.
- * (This says nothing about whether it is worthwhile to consider so many
- * matches; this is just defining the worst case.)
+ * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
+ * be needed to output a symbol that was unused in the previous optimization
+ * pass.  Assigning a default cost allows the symbol to be used in the next
+ * optimization pass.  However, the cost should be relatively high because the
+ * symbol probably won't be used very many times (if at all).
  */
-#  define MAX_MATCHES_PER_POS	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
+#define LITERAL_NOSTAT_BITS	13
+#define LENGTH_NOSTAT_BITS	13
+#define OFFSET_NOSTAT_BITS	10
 
 /*
- * The number of lz_match structures in the match cache, excluding the extra
- * "overflow" entries.  This value should be high enough so that nearly the
- * time, all matches found in a given block can fit in the match cache.
- * However, fallback behavior (immediately terminating the block) on cache
- * overflow is still required.
+ * This is (approximately) the maximum number of matches that the compressor
+ * will cache per block.  If the match cache becomes full, then the compressor
+ * will be forced to end the block early.  This value should be large enough so
+ * that this rarely happens, due to the block being ended normally before the
+ * cache fills up.  Increasing/decreasing this parameter will increase/decrease
+ * per-compressor memory usage linearly.
  */
-#  define MATCH_CACHE_LENGTH      (SOFT_MAX_BLOCK_LENGTH * 5)
+#define MATCH_CACHE_LENGTH	(SOFT_MAX_BLOCK_LENGTH * 5)
 
 #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
 
+/******************************************************************************/
+
+/* Include the needed matchfinders. */
+#define MATCHFINDER_WINDOW_ORDER	DEFLATE_WINDOW_ORDER
+#include "hc_matchfinder.h"
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+#  include "bt_matchfinder.h"
 /*
- * These are the compressor-side limits on the codeword lengths for each Huffman
- * code.  To make outputting bits slightly faster, some of these limits are
- * lower than the limits defined by the DEFLATE format.  This does not
- * significantly affect the compression ratio, at least for the block lengths we
- * use.
+ * This is the maximum number of matches the binary trees matchfinder can find
+ * at a single position.  Since the matchfinder never finds more than one match
+ * for the same length, presuming one of each possible length is sufficient for
+ * an upper bound.  (This says nothing about whether it is worthwhile to
+ * consider so many matches; this is just defining the worst case.)
  */
-#define MAX_LITLEN_CODEWORD_LEN		14
-#define MAX_OFFSET_CODEWORD_LEN		DEFLATE_MAX_OFFSET_CODEWORD_LEN
-#define MAX_PRE_CODEWORD_LEN		DEFLATE_MAX_PRE_CODEWORD_LEN
+#define MAX_MATCHES_PER_POS	\
+	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
+#endif
 
 /* Table: length slot => length slot base value  */
 static const unsigned deflate_length_slot_base[] = {
@@ -191,48 +234,6 @@ struct deflate_freqs {
 	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
 };
 
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-
-/* Costs for the near-optimal parsing algorithm.  */
-struct deflate_costs {
-
-	/* The cost to output each possible literal.  */
-	u32 literal[DEFLATE_NUM_LITERALS];
-
-	/* The cost to output each possible match length.  */
-	u32 length[DEFLATE_MAX_MATCH_LEN + 1];
-
-	/* The cost to output a match offset of each possible offset slot.  */
-	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
-};
-
-/*
- * BIT_COST is a scaling factor that allows the compressor to consider
- * fractional bit costs when deciding which literal/match sequence to use.  This
- * is useful when the true symbol costs are unknown.  For example, if the
- * compressor thinks that a symbol has 6.5 bits of entropy, it can set its cost
- * to 6.5 bits rather than have to use 6 or 7 bits.  Although in the end each
- * symbol will use a whole number of bits due to the Huffman coding, considering
- * fractional bits can be helpful due to the limited information.
- *
- * BIT_COST should be a power of 2.  A value of 8 or 16 works well.  A higher
- * value isn't very useful since the calculations are approximate anyway.
- */
-#define BIT_COST	8
-
-/*
- * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
- * be needed to output a symbol that was unused in the previous optimization
- * pass.  Assigning a default cost allows the symbol to be used in the next
- * optimization pass.  However, the cost should be relatively high because the
- * symbol probably won't be used very many times (if at all).
- */
-#define LITERAL_NOSTAT_BITS	13
-#define LENGTH_NOSTAT_BITS	13
-#define OFFSET_NOSTAT_BITS	10
-
-#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
 /*
  * Represents a run of literals followed by a match or end-of-block.  This
  * struct is needed to temporarily store items chosen by the parser, since items
@@ -266,6 +267,19 @@ struct deflate_sequence {
 
 #if SUPPORT_NEAR_OPTIMAL_PARSING
 
+/* Costs for the near-optimal parsing algorithm.  */
+struct deflate_costs {
+
+	/* The cost to output each possible literal.  */
+	u32 literal[DEFLATE_NUM_LITERALS];
+
+	/* The cost to output each possible match length.  */
+	u32 length[DEFLATE_MAX_MATCH_LEN + 1];
+
+	/* The cost to output a match offset of each possible offset slot.  */
+	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
+};
+
 /*
  * This structure represents a byte position in the input data and a node in the
  * graph of possible match/literal choices for the current block.
@@ -311,6 +325,7 @@ struct deflate_optimum_node {
 #define NUM_LITERAL_OBSERVATION_TYPES 8
 #define NUM_MATCH_OBSERVATION_TYPES 2
 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
+#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
 struct block_split_stats {
 	u32 new_observations[NUM_OBSERVATION_TYPES];
 	u32 observations[NUM_OBSERVATION_TYPES];