Skip to content

Commit

Permalink
Merge pull request #67 from yhoogstrate/chunked_reader__with_state_pa…
Browse files Browse the repository at this point in the history
…ttern

Chunked reader  with state pattern
  • Loading branch information
yhoogstrate authored Jan 22, 2023
2 parents e28ffaf + 97b6682 commit 654ea09
Show file tree
Hide file tree
Showing 42 changed files with 1,669 additions and 889 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,6 @@ perf.*
analysis.txt
*.naf
.kdev4
compile_commands.json
*.fastafs
*.zst
13 changes: 11 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ cmake_minimum_required(VERSION 2.8)

project(fastafs)

# helps debugging:
# Do this once in a while - find different compiler warnings
#set(CMAKE_CXX_COMPILER "clang++")

set(PROJECT_VERSION "1.9.0")
set(PROJECT_VERSION "1.10.0")
set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs")
set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues")

Expand Down Expand Up @@ -112,7 +113,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")

# -DXXH_NAMESPACE=ZST_
if(DEBUG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pg -ggdb -Wconversion -D_FILE_OFFSET_BITS=64")# -Werror makes compilation crash when warnings are given (also part of Travis)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pg -ggdb -Wconversion -D_FILE_OFFSET_BITS=64 -g -DBOOST_TEST_TOOLS_UNDER_DEBUGGER -DBOOST_TEST_TOOLS_DEBUGGABLE")# -Werror makes compilation crash when warnings are given (also part of Travis)
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -D_FILE_OFFSET_BITS=64")
endif()
Expand All @@ -128,9 +129,12 @@ add_library(libfastafs SHARED
src/flags.cpp
src/fastafs.cpp
src/ucsc2bit.cpp

src/xbit_byte_encoder.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/fivebit_fivebytes.cpp

src/database.cpp
src/utils.cpp
src/sequence_region.cpp
Expand Down Expand Up @@ -201,6 +205,11 @@ add_test(test_chunked_reader ${BUILD_TEST_DIR}/test_chunked_reader)
set_target_properties(test_chunked_reader PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${BUILD_TEST_DIR})
target_link_libraries(test_chunked_reader libfastafs)

add_executable(test_database test/database/test_database.cpp)
set_target_properties(test_database PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${BUILD_TEST_DIR})
target_link_libraries(test_database libfastafs)
add_test(test_database ${BUILD_TEST_DIR}/test_database)

add_executable(test_fastafs_as_ucsc2bit test/fastafs/test_ucsc2bit.cpp)
add_test(test_fastafs_as_ucsc2bit ${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit)
set_target_properties(test_fastafs_as_ucsc2bit PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${BUILD_TEST_DIR})
Expand Down
7 changes: 7 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
2023-01-22
* v1.10.0
* Better ninja/meson support
* Code clean-ups
* Restructured the chunked_reader class and subclasses according to
the design patterns philosophy

2020-04-29 Youri Hoogstrate

* v1.9.0
Expand Down
3 changes: 1 addition & 2 deletions dependencies/zstd-seekable-adapted/zstdseek_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,15 +175,13 @@ size_t ZSTD_seekable_decompressFile_orDie(ZSTD_seekable_decompress_init_data* fh
size_t written = 0;

if(fh->fin == NULL) {
printf("fin == NULL: YES!!\n");
exit(124);
}
//else {
// printf("[%i] == NULL: no\n",msgid);
//}

if (feof(fh->fin)) {
printf ("!!!! FEOF !!!!! \n");
exit(123);
}
//else {
Expand Down Expand Up @@ -232,6 +230,7 @@ size_t ZSTD_seekable_decompressFile_orDie(ZSTD_seekable_decompress_init_data* fh

//fh->fin_locked = false;


return written;
}

Expand Down
4 changes: 4 additions & 0 deletions fastafs.kdev4
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[Project]
CreatedFrom=CMakeLists.txt
Manager=KDevCMakeManager
Name=fastafs
88 changes: 71 additions & 17 deletions include/chunked_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,49 +19,103 @@
#include "zstd_seekable_utils.hpp"


enum compression_type : signed char {
// Compression type tag with a signed char underlying type; `undefined` (-1) is the
// value for a type that has not been determined.
enum compression_type : signed char { // this is the State
undefined = -1,
uncompressed = 0,
zstd = 1
};


//@todo implement w/ state design pattern
//url: https://refactoring.guru/design-patterns/state


class chunked_reader
class chunked_reader;

// Abstract base class of the reader states: the operations that differ per
// compression type (fopen, cache_buffer, seek) are declared as pure virtuals.
// NOTE(review): the chunked_reader constructor/destructor lines inside this class look like
// removed-side lines of the commit diff rather than members of State - confirm against the real header.
class State
{
protected:
chunked_reader *context; // back-reference to context, to access file_i, filename etc.

public:
chunked_reader(char *); // filename
chunked_reader(const char *); // filename
~chunked_reader();
virtual ~State() {};
void set_context(chunked_reader *); // presumably stores the back-reference in `context` - TODO confirm in the .cpp
size_t read(unsigned char *, size_t, size_t &, size_t &); // reads from buffer, context-agnostic (non-virtual)

// virtual functions:
virtual void fopen(off_t) = 0;
virtual size_t cache_buffer() = 0; // formerly update_..._buffer
virtual void seek(off_t) = 0;
}; // compression type



class ContextUncompressed : public State
{
private:
std::ifstream *fh = nullptr;

public:
void fopen(off_t) override;
size_t cache_buffer() override;
void seek(off_t);

~ContextUncompressed() override;
};

class ContextZstdSeekable : public State
{
private:
ZSTD_seekable_decompress_init_data* fh = nullptr;

void init(); // generic tasks needed for init
size_t const buffOutSize = ZSTD_DStreamOutSize();
char* const buffOut = (char*) malloc_orDie(buffOutSize);
ZSTD_seekable* const seekable = ZSTD_seekable_create(); //@todo -> in constructor, check if not NULL

std::string filename; // try doing this with inode
size_t maxFileSize;

std::ifstream *fh_flat;
void update_flat_buffer();
public:
void fopen(off_t) override;
size_t cache_buffer() override;
void seek(off_t);

ZSTD_seekable_decompress_init_data* fh_zstd;
void update_zstd_buffer();
~ContextZstdSeekable() override;
};


compression_type filetype;
// Context class of the reader: owns the read buffer and file position and holds a
// pointer to the current State.
// NOTE(review): this view interleaves old and new diff lines (e.g. read(unsigned char *, size_t)
// is declared both as protected and as public) - confirm the member list against the real header.
class chunked_reader // master chunked_reader
{
protected:
std::string filename;

char buffer[READ_BUFFER_SIZE + 1];

size_t buffer_i; // presumably the current read position within `buffer` - TODO confirm
size_t buffer_n; // presumably the number of valid bytes in `buffer` - TODO confirm

off_t file_i;

void set_filetype();
State *state; // current state object; NOTE(review): ownership/deletion is not visible here

size_t read(char *, size_t);// @deprecate
size_t read(unsigned char *, size_t);
unsigned char read();
public:
void TransitionTo(State *); // @todo rename to set_compression_type
chunked_reader(const char *) ;
~chunked_reader();

State* find_state();
const std::type_info& typeid_state();

const std::string& get_filename();
char* get_buffer();

compression_type get_filetype();

// Same names as the State virtuals (fopen, cache_buffer, seek); presumably forwarded to `state` - TODO confirm.
void fopen(off_t);
size_t cache_buffer();
size_t read(unsigned char *, size_t);
void seek(off_t);
size_t tell();
//size_t size();
size_t get_file_i();
};


Expand Down
10 changes: 6 additions & 4 deletions include/database.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,18 @@

// Database of fastafs files rooted at a directory (`path`).
// NOTE(review): `path` and `idx` each appear twice below (non-const and const); these look like
// the old and new side of the commit diff - confirm which pair the real header keeps.
class database
{
std::string path;
std::string idx;// current default: ~/.local/share/fastafs/
const std::string path;
const std::string idx;// current default: ~/.local/share/fastafs/
//hash_map<string, unsigned int> idx;// "test": path + "/" + tostr(i) + ".fastafs"

public:
database(); // presumably uses get_default_dir() - TODO confirm in the .cpp
database(const std::string &);

const static std::string get_default_dir();

void force_db_exists();
std::string add(char *);
void load();// reads path + "/" + info.txt, only containing N
void list();// 'ls'
std::string get(std::string);
std::string get(char *);
};
11 changes: 6 additions & 5 deletions include/fastafs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ struct ffs2f_init_seq {

std::vector<uint32_t> m_starts;// file position based
std::vector<uint32_t> m_ends;// file position based

const uint32_t filesize;// with padding and newlines [fastafs_seq->fasta_filesize(cache->padding_arg)]

ffs2f_init_seq(const uint32_t padding, size_t n_blocks, size_t m_blocks, const uint32_t n_lines, const uint32_t filesize):
Expand All @@ -46,7 +46,8 @@ struct ffs2f_init {

ffs2f_init(size_t size, uint32_t padding_arg): padding_arg(padding_arg), sequences(size) {}

~ffs2f_init(void) {
~ffs2f_init(void)
{
for(size_t i = 0; i < sequences.size(); i++) {
delete sequences[i];
}
Expand Down Expand Up @@ -78,7 +79,7 @@ class fastafs_seq
uint32_t fasta_filesize(uint32_t padding);
void view_fasta(ffs2f_init_seq*, chunked_reader &fh);

size_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, std::ifstream *);
size_t view_sequence_region_size(sequence_region*);
uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, chunked_reader &);
uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, chunked_reader &);
template <class T> uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, chunked_reader &);
Expand Down Expand Up @@ -113,7 +114,7 @@ class fastafs
std::string name;
std::string filename;
compression_type filetype;

std::vector<fastafs_seq*> data;
uint32_t crc32f;// crc32 as found in fastafs file

Expand All @@ -126,7 +127,7 @@ class fastafs
void load(std::string);
void view_fasta(ffs2f_init*);

size_t view_sequence_region_size(ffs2f_init*, const char *); // read stuff like "chr1:123-456" into the buffer
size_t view_sequence_region_size(const char *); // read stuff like "chr1:123-456" into the buffer
uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer
uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t, chunked_reader &);
uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t);
Expand Down
4 changes: 2 additions & 2 deletions include/fivebit_fivebytes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ class fivebit_fivebytes
static unsigned char iterator_to_offset(unsigned int);
static unsigned char decompressed_to_compressed_bytes(unsigned char); // when only 5/8 bytes are filled, only 4/5 bytes need to be written

static const off_t nucleotides_to_compressed_fileoffset(size_t); // file offset after which data can be read
static const off_t nucleotides_to_compressed_offset(size_t);// number of bytes needed to write out that much data
static off_t nucleotides_to_compressed_fileoffset(size_t); // file offset after which data can be read
static off_t nucleotides_to_compressed_offset(size_t);// number of bytes needed to write out that much data


void next(chunked_reader &); // update the compressed data and set buffer to decompressed data
Expand Down
51 changes: 17 additions & 34 deletions include/flags.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
#include <array>


const unsigned char FASTAFS_BITFLAG_COMPLETE = 0;
const static unsigned char FASTAFS_BITFLAG_COMPLETE = 0;

const unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1 = 0;
const unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2 = 1;
// const unsigned char FASTAFS_SEQUENCE_BITFLAG_???? = 2 ; // is reserved
const unsigned char FASTAFS_SEQUENCE_BITFLAG_COMPLETE = 3;
const unsigned char FASTAFS_SEQUENCE_BITFLAG_CIRCULAR = 4;
const static unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1 = 0;
const static unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2 = 1;
// const static unsigned char FASTAFS_SEQUENCE_BITFLAG_???? = 2 ; // is reserved
const static unsigned char FASTAFS_SEQUENCE_BITFLAG_COMPLETE = 3;
const static unsigned char FASTAFS_SEQUENCE_BITFLAG_CIRCULAR = 4;



Expand All @@ -38,34 +38,29 @@ constexpr std::array<unsigned char, 16> bitmasks = {
};


//#include "utils.hpp"


// Base class for flag sets stored in two bytes (`bits`); individual flags are addressed
// by index through set_flag()/get_flag().
// NOTE(review): `bits`, set() and get_bits() each appear twice below; these look like the old
// and new side of the commit diff - confirm against the real header.
class twobit_flag
{
private:
std::array<unsigned char, 2> bits; // 00000000 00000000

protected:
twobit_flag();

std::array<unsigned char, 2> bits; // 00000000 00000000

// set by flag
void set_flag(unsigned char, bool);// counting flag from bit 0(!)
bool get_flag(unsigned char);

public:
void set(char *);
std::array<unsigned char, 2> &get_bits(void); // get bit 0 or bit 1
void set(unsigned char *);
std::array<unsigned char, 2> &get_bits(void); // get bit 0 or bit 1 - needed for exporting flags to file(s)
};


class fastafs_flags : public twobit_flag
{
public:
bool is_complete();
bool is_incomplete()
{
return !this->is_complete();
};
bool is_incomplete();

void set_complete();
void set_incomplete();
Expand All @@ -83,25 +78,13 @@ class fastafs_sequence_flags : public twobit_flag
bool is_protein(); // alphabet: 'ABCDEFGHIJKLMNOPQRSTUVWYZX*-'

bool is_complete();
bool is_incomplete()
{
return !this->is_complete();
};
bool is_incomplete(); // is not complete

bool is_circular();
bool is_linear()
{
return !this->is_circular();
};

bool is_twobit()
{
return (this->is_dna() | this->is_rna());
};
bool is_fourbit()
{
return this->is_iupec_nucleotide();
};
bool is_linear(); // is not circular

bool is_twobit();
bool is_fourbit();


// set by entity
Expand Down
Loading

0 comments on commit 654ea09

Please sign in to comment.