Skip to content

Commit

Permalink
Merge pull request #47 from yhoogstrate/alphabet_extension
Browse files Browse the repository at this point in the history
v1.7.0: large and feature rich update
  • Loading branch information
yhoogstrate authored Jan 29, 2020
2 parents a3e818d + 3607e5c commit c52232b
Show file tree
Hide file tree
Showing 48 changed files with 4,125 additions and 293 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,9 @@ repeats.txt
build/
xcheck.sh
*.fa.fai
*.o
/bin-meson
/build-meson
*.ninja
.ninja*
test-mount.sh
53 changes: 38 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ project(fastafs)
# Do this once in a while - find different bugs
#set(CMAKE_CXX_COMPILER "clang++")

set(PROJECT_VERSION "1.6.2")
set(PROJECT_VERSION "1.7.0")
set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs")
set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues")

Expand Down Expand Up @@ -41,7 +41,11 @@ else()
set(DEBUG "false")
endif()

configure_file("include/config.hpp.in" "include/config.hpp")
configure_file("include/config.hpp.in" "include/config.hpp")# implies building is done from project root
#configure_file("include/config.hpp.in" "${CMAKE_CURRENT_BINARY_DIR}/config.hpp")
#configure_file("include/config.hpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/config.hpp")
configure_file("include/config.hpp.in" "${BUILD_DIR}/include/config.hpp")


# ----------------------------------------------------------------------
# ------------------------------ Styling -------------------------------
Expand Down Expand Up @@ -75,7 +79,9 @@ add_custom_target(tidy DEPENDS make_tidy )

add_subdirectory(src)
include_directories(include)
add_definitions(-std=c++17)
#include_directories(${BUILD_DIR})
include_directories("${BUILD_DIR}/include")
add_definitions(-std=c++14)

# Boost
find_package(Boost COMPONENTS unit_test_framework REQUIRED)
Expand All @@ -87,9 +93,11 @@ else()
include_directories(${Boost_INCLUDE_DIRS})
endif()


link_libraries(ssl)
link_libraries(crypto)
link_libraries(fuse)
link_libraries(z)# zlib; -lz; for crc32 checks on whole file integrity


if(DEBUG)
Expand All @@ -100,13 +108,17 @@ endif()

add_executable(fastafs
src/main.cpp
src/fasta_to_fastafs.cpp
src/fasta_to_twobit_fastafs.cpp
src/fasta_to_fourbit_fastafs.cpp
src/ucsc2bit_to_fastafs.cpp
src/flags.cpp
src/fastafs.cpp
src/ucsc2bit.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/database.cpp
src/utils.cpp
src/sequence_region.cpp
src/fuse.cpp
src/lsfastafs.cpp
)
Expand All @@ -115,24 +127,29 @@ set_target_properties(fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}"
# mount-only binary, without all the other stuff 'mount.fastafs' [for fstab]
add_executable(mount.fastafs
src/main_mount.cpp
src/fasta_to_fastafs.cpp
src/fasta_to_twobit_fastafs.cpp
src/ucsc2bit_to_fastafs.cpp
src/flags.cpp
src/fastafs.cpp
src/ucsc2bit.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/database.cpp
src/utils.cpp
src/sequence_region.cpp
src/fuse.cpp
src/lsfastafs.cpp
)
set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}")

add_library(libfastafs SHARED
src/fasta_to_fastafs.cpp
src/fasta_to_twobit_fastafs.cpp
src/ucsc2bit_to_fastafs.cpp
src/flags.cpp
src/fastafs.cpp
src/ucsc2bit.cpp
src/twobit_byte.cpp
src/fourbit_byte.cpp
src/database.cpp
src/utils.cpp
src/fuse.cpp
Expand All @@ -146,11 +163,11 @@ set_target_properties(libfastafs PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(libfastafs PROPERTIES SOVERSION 1)
set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs)

#set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include")
# great, this doesn't go automagically with an entire dir
set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp")
#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include)
#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include")
##set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include")
## great, this doesn't go automagically with an entire dir
set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fourbit_fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/flags.hpp;include/fourbit_byte.hpp;include/fuse.hpp;include/lsfastafs.hpp;include/sequence_region.hpp;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp")
##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include)
##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include")

# ----------------------------------------------------------------------
# ------------------------------ Testing -------------------------------
Expand All @@ -161,16 +178,20 @@ enable_testing()

add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as alias for 'make test'

add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte")
add_test(test_cache "${BUILD_TEST_DIR}/test_cache")
add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N)
add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-)
add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit")
add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_fourbit")
add_test(test_view "${BUILD_TEST_DIR}/test_view")
#add_test(test_tree "${BUILD_TEST_DIR}/test_tree")
add_test(test_flags "${BUILD_TEST_DIR}/test_flags")
add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs")
add_test(test_check "${BUILD_TEST_DIR}/test_check") # file integrity checks
add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit")
add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs")
add_test(test_ucsc2bit_as_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_as_fasta")
add_test(test_sequenceregion "${BUILD_TEST_DIR}/test_sequenceregion")
add_test(test_utils "${BUILD_TEST_DIR}/test_utils")

#add_test(test_tree "${BUILD_TEST_DIR}/test_tree")

#find_program(CTEST_MEMORYCHECK_COMMAND NAMES valgrind) # 'ctest -T memcheck'
#INCLUDE(Dart)
Expand All @@ -181,6 +202,8 @@ add_test(test_utils "${BUILD_TEST_DIR}/test_utils")
# The compiled binary, usually to: /usr/local/bin/fastafs
install(TARGETS fastafs DESTINATION "bin")
install(TARGETS mount.fastafs DESTINATION "bin")

# don't build during debug at least
install(TARGETS libfastafs LIBRARY DESTINATION "lib" PUBLIC_HEADER DESTINATION "include/libfastafs")

# ----------------------------------------------------------------------
20 changes: 20 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,23 @@
2020-01-27 Youri Hoogstrate

* v1.7.0
* `fastafs cache -o` for custom output files and bypassing the config
* Random access subsequence retrieval directly via filesystem: `<mount>/seq/chr1:100-200`
* Implements CRC32 checksums for whole-file integrity
* Converting to meson because of insane build times using cmake+make and re-building files that have not changed
* `fastafs view|mount -m/--no-masking` virtualises fasta files without masking (uppercase)
* Minor support for building with meson and ninja
* cmake template allows building for guix (+guix file provided)
* Changed requirement from c++17 to c++14 to avoid large compatibility issues
* Implements bitflags with corresponding class
* Implements fourbit (and automatically switches over if non ACTGUN chars are found)
* Implements functions `is_fasta_file`, and `is_ucsc2bit_file` using file MAGIC
* Creates FASTAFS files that are first flagged as incomplete and are unflagged after conversion has completed
* MD5sums working for fourbit compressed sequences
* Implements `fastafs cache -o` to export to desired output fastafs file
* Adds compression type to `fastafs list` output
* More and improved testing, including file integrity detection

2019-09-06 Youri Hoogstrate

* v1.6.2
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# fastafs: fuse layer and file system for storing FASTA files
# FASTAFS: toolkit for file system virtualisation of random access compressed FASTA files

----

Expand Down Expand Up @@ -27,7 +27,7 @@ Required dependencies are:
- libboost (only for unit testing, will become an optional dependency soon)
- libopenssl (for generating MD5 hashes)
- libfuse (for access to the fuse layer system and file virtualization)
- c++ compiler supporting c++-17
- c++ compiler supporting c++-14

Compilation is done using cmake. The build command to run cmake for common use is:

Expand Down
1 change: 0 additions & 1 deletion bin/.gitignore

This file was deleted.

8 changes: 7 additions & 1 deletion build-debug.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
#!/bin/bash

#cmake -GNinja -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON .
#ninja -j`nproc`
#ninja install


## using make - sometimes much slower
cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON .
make "$@" -j `nproc`
make "$@" -j $(nproc)
make install
5 changes: 5 additions & 0 deletions build-release-meson.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
#
# Build with meson + ninja.
# Configures the build directory 'bin-meson' and then runs ninja inside it.

meson bin-meson

# Stop if the build directory cannot be entered (e.g. meson failed to create
# it); otherwise ninja would run in the current directory and could pick up
# an unrelated build.ninja.
cd bin-meson || exit 1

ninja
6 changes: 4 additions & 2 deletions build-release.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

#cmake -GNinja -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON .

cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON .
make "$@" -j `nproc`
sudo make install
make "$@" -j $(nproc)
make install
18 changes: 18 additions & 0 deletions doc/4bit.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
A 0000
B 0001
C 0010
D 0011
G 0100
H 0101
K 0110
M 0111
N 1000
R 1001
S 1010
T 1011
U 1100
V 1101
W 1110
Y 1111

- by idx
24 changes: 17 additions & 7 deletions doc/FASTAFS-FORMAT-SPECIFICATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ If this metadata would be written in the header located before the sequence data
| GENERIC-HEADER | | | |
| | [MAGIC](#magic) | 4 bytes | `x0F x0A x46 x53` |
| | [FILE FORMAT VERSION](#file-format-version) | [4-byte integer](#four-byte-integer) | `x00 x00 x00 x00` |
| | [FASTAFS-FLAG](#fastafs-flag) | 2 bytes | Certain binary flags |
| | [FASTAFS-FLAGS](#fastafs-flags) | 2 bytes | Certain binary flags |
| | [FILE-POSITION-OF-INDEX](#file-position-of-the-index) | [4-byte integer](#four-byte-integer) | Location in the file (offset in bytes from beginning) where the INDEX is located |
| DATA | --- | --- | --- |
| -> per sequence |
Expand All @@ -40,7 +40,7 @@ If this metadata would be written in the header located before the sequence data
| INDEX | --- | --- | |
| | NUMBER-SEQUENCES | uint32_t as [4-byte integer](#four-byte-integer) | Number of sequences included |
| -> per sequence |
| | [SEQUENCE-FLAG](#sequence-flag) | 2 bytes | storing metadata and type of data |
| | [SEQUENCE-FLAGS](#sequence-flags) | 2 bytes | storing metadata and type of data |
| | NAME-LENGTH | 1 byte as unsigned char | length in bytes; name cannot exceed 255 bytes |
| | NAME-FASTA | NAME-LENGTH x char | FASTA header; may not include new-lines or '>' |
| | START-POSITION-IN-BODY of N-COMPR-NUC | uint32_t as [4-byte integer](#four-byte-integer) | Location in the file (offset in bytes from beginning) where the DATA block for this sequence starts |
Expand All @@ -50,7 +50,7 @@ If this metadata would be written in the header located before the sequence data
| | METADATA-TYPE-FLAG | 2 bytes |
| | ENTRY | type specific, examples below: |
| | => ORIGINAL PADDING | uint32_t as [4-byte integer](#four-byte-integer) | The number of nucleotides per line in the original FASTA file |

| CRC32 | Checksum on entire file | 4 bytes | To ensure whole file integrity |

### GENERIC-HEADER ###

Expand Down Expand Up @@ -80,7 +80,7 @@ The bit representation of these bytes are:
+--------+--------+--------+--------+
```

#### FASTAFS-FLAG ####
#### FASTAFS-FLAGS ####

```
bit 0 file-complete
Expand Down Expand Up @@ -115,13 +115,23 @@ The index is located at the end of the data. This file offset in bytes from the

Repeated for every sequence, in order matching SEQUENCE-HEADER

#### SEQUENCE-FLAG ####
#### SEQUENCE-FLAGS ####

The sequence flags describe the following metadata for each sequence:

```
bit 0 is-rna [1 = yes, 0 = DNA]
bit 1 reserved [reserved, library type 2 -> protein]
bit 0 combined sequence type
bit 1 combined sequence type
```

| bit-0 | bit-1 | Type | Alphabet |
| ---- | ---- | - | - |
| `0` | `0` | DNA | `ACTG` + `N` |
| `1` | `0` | RNA | `ACUG` + `N` |
| `0` | `1` | IUPAC Nucleotide | `ACGTURYKMSWBDHVN` + `-` |
| `1` | `1` | reserved for protein | to be determined |

```
bit 2 reserved [reserved, library type 2 -> protein]
bit 3 is-complete [1: checksum is present, 0: some regions are reserved but not yet 'downloaded']
bit 4 is-circular
Expand Down
40 changes: 40 additions & 0 deletions guix.scm
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
;; guix package --install-from-file=/home/youri/src/fastafs/guix.scm
;; https://guix.gnu.org/blog/2018/a-packaging-tutorial-for-guix/

(use-modules (guix packages)
(guix download)
(guix git-download)
(guix build-system gnu)
(guix build-system cmake)
(guix licenses)
(gnu packages boost)
(gnu packages compression)
(gnu packages tls)
(gnu packages linux))

;; Guix package definition for fastafs. It is the top-level expression of this
;; file, for use with `guix package --install-from-file=<this file>`.
(package
(name "fastafs")
(version "1.7.0")
;; The source is fetched with url-fetch and checked against the sha256 below.
(source (origin
(method url-fetch)
; (uri (string-append "https://github.com/yhoogstrate/fastafs/archive/a39eddbf810d7a828d33d6dbe8c913bbffd58948.tar.gz"))
;; NOTE(review): the active URI points at a tarball in one user's home
;; directory, so this only resolves on that machine. Switching to the
;; commented-out GitHub archive URI presumably needs a new sha256 -- confirm.
(uri (string-append "file:///home/youri/.local/src/fastafs.tar.gz"))
(sha256
(base32
"1njzvaxy1nq4202ispphyxddihq1x1cmfzbl8zmkqiwa028k540c"))))
;; Built through the project's CMake files.
(build-system cmake-build-system)
;; NOTE(review): the build type is "debug" although the package version is a
;; release version -- confirm this is intended.
(arguments
`(#:build-type "debug"
#:tests? #f) ; skip tests that fail because test data is not in build path
)
;; Packages made available to the build: boost, zlib, openssl and fuse.
(inputs
`(("boost" ,boost)
("zlib" ,zlib)
("openssl" ,openssl)
("fuse" ,fuse)
))
(synopsis "fastafs")
(description
"fastafs: toolkit for file system virtualisation of random access compressed FASTA, FAI, DICT & TWOBIT files")
(home-page "https://github.com/yhoogstrate/fastafs")
(license gpl2+))
4 changes: 4 additions & 0 deletions include/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,8 @@ static const std::string DICT_HEADER = "@HD\tVN:1.0\tSO:unsorted\n";
static const std::string FASTAFS_FILE_XATTR_NAME = "fastafs-file";
static const std::string FASTAFS_PID_XATTR_NAME = "fastafs-pid";


static const size_t MAX_SIZE_SEQ_NAME = 255;


#endif
Loading

0 comments on commit c52232b

Please sign in to comment.