diff --git a/.github/workflows/serve_mkdocs.yml b/.github/workflows/serve_mkdocs.yml new file mode 100644 index 0000000000..865169498f --- /dev/null +++ b/.github/workflows/serve_mkdocs.yml @@ -0,0 +1,30 @@ +# Adapted from https://squidfunk.github.io/mkdocs-material/publishing-your-site/ +name: Serve MkDocs +on: + push: + branches: + - master +permissions: + contents: write +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Configure Git Credentials + run: | + git config user.name github-actions[bot] + git config user.email 41898282+github-actions[bot]@users.noreply.github.com + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV + - uses: actions/cache@v4 + with: + key: mkdocs-material-${{ env.cache_id }} + path: .cache + restore-keys: | + mkdocs-material- + - run: pip install mkdocs-material + - run: pip install -e mkdocs/mkdocs-toc-tag-filter + - run: mkdocs gh-deploy --force \ No newline at end of file diff --git a/docs/all.code_organization.reference.md b/docs/all.code_organization.reference.md index bb188c093b..fdf7b4532d 100644 --- a/docs/all.code_organization.reference.md +++ b/docs/all.code_organization.reference.md @@ -1,4 +1,4 @@ - +# Code Organization @@ -23,9 +23,9 @@ -# Code organization of `amp` +## Code organization of `amp` -## Conventions +### Conventions - In this code organization files we use the following conventions: - Comments: `"""foobar is ..."""` @@ -64,14 +64,14 @@ - Needs to go in `//lime/dataflow_lime/system` - Can be called `eg_historical_data_source.py` -## Finding deps +### Finding deps -### Using `invoke find_dependency` +#### Using `invoke find_dependency` ``` > i find_dependency --module-name "amp.dataflow.model" --mode "find_lev2_deps" --ignore-helpers --only-module dataflow ``` -### Using grep +#### Using grep - To check for dependencies between one module (e.g., `dataflow/model`) and another (e.g., `dataflow/system`): @@ -79,7 +79,7 @@ > (cd dataflow/model/; jackpy "import ") | grep -v notebooks | grep -v test | grep -v __init__ | grep "import dataflow.system" | sort ``` -### Using Pydeps +#### Using Pydeps - Install ``` @@ -97,7 +97,7 @@ > helpers.telegram_notify -vv --show-dot -o deps.html --max-bacon 2 --reverse ``` -## Component dirs +### Component dirs Useful utilities are: ``` @@ -113,12 +113,12 @@ im_v2/ccxt/data/ 4 directories, 1 file ``` -### `helpers/` +#### `helpers/` - `helpers/` - """Low-level helpers that are general and not specific of this project""" -### `core/` +#### `core/` - `core/` - """Low-level helpers that are specific of this project""" @@ -132,17 +132,17 @@ im_v2/ccxt/data/ - `signal_processing.py` - `statistics.py` -### `devops/` +#### `devops/` - `devops/` - TODO(gp): reorg -### `dev_scripts/` +#### `dev_scripts/` - `/dev_scripts` - TODO(gp): reorg -### `im/` +#### `im/` - `sorrentum_sandbox/` - `common/` @@ -203,7 +203,7 @@ im_v2/ccxt/data/ - `kibot/` - `mock1/` -### `market_data/` +#### `market_data/` - `market_data/` - """Interface to read price data""" @@ -213,7 +213,7 @@ im_v2/ccxt/data/ - `ReplayedMarketData` - TODO(gp): Move market_data to `datapull/market_data` -### `dataflow/` +#### `dataflow/` - `dataflow/` - """DataFlow module""" @@ -283,7 +283,7 @@ im_v2/ccxt/data/ - `/research_amp` -## dataflow dependencies +### dataflow dependencies - `dataflow/core` - Should not depend on anything in `dataflow` @@ -303,7 +303,7 @@ im_v2/ccxt/data/ - TODO(gp): Move backtest up -## Top level dirs +### Top level dirs 
```text (cd amp; tree -L 1 -d --charset=ascii -I "*test*|*notebooks*" 2>&1 | tee /tmp/tmp) @@ -320,7 +320,7 @@ im_v2/ccxt/data/ `-- research_amp ``` -# Invariants +## Invariants - We assume that there is no file with the same name either in the same repo or across different repos @@ -342,7 +342,7 @@ im_v2/ccxt/data/ - Note that this rule makes the naming of files depending on the history, but it minimizes churn of names -# Misc +## Misc - To execute a vim command, go on the line diff --git a/docs/all.documentation_meta.reference.md b/docs/all.documentation_meta.reference.md new file mode 100644 index 0000000000..27c87607bf --- /dev/null +++ b/docs/all.documentation_meta.reference.md @@ -0,0 +1,178 @@ +# Documentation Meta + + + +- [How to organize the docs](#how-to-organize-the-docs) + * [Dir vs no-dirs](#dir-vs-no-dirs) + * [Tracking reviews and improvements](#tracking-reviews-and-improvements) + * [How to search the documentation](#how-to-search-the-documentation) + * [Ensure that all the docs are cross-referenced in the indices](#ensure-that-all-the-docs-are-cross-referenced-in-the-indices) +- [List of files](#list-of-files) + * [Description](#description) + + + +## How to organize the docs + +- Documentation can be organized in multiple ways: + - By software component + - By functionality (e.g., infra, backtesting) + - By team (e.g., trading ops) + +- We have decided that + - For each software component there should be a corresponding documentation + - We have documentation for each functionality and team + +- Processes + - `onboarding` + - `general_background` + - `work_organization` + - `work_tools` + - `coding` + - ... + +- Software components + - `build` + - `kaizenflow` + - `datapull` + - `dataflow` + - `trade_execution` + - `infra` + - ... + +### Dir vs no-dirs + +- Directories make it difficult to navigate the docs +- We use “name spaces” until we have enough objects to create a dir + +### Tracking reviews and improvements + +- Doc needs to be reviewed "actively", e.g., by making sure someone checks them + in the field +- Somebody should verify that is "executable" + +- There is a + [Master Documentation Gdoc](https://docs.google.com/document/d/1sEG5vGkaNIuMEkCHgkpENTUYxDgw1kZXb92vCw53hO4) + that contains a list of tasks related to documentation, including what needs + to be reviewed + +- For small action items we add a markdown TODO like we do for the code + ``` + + ``` + +- To track the last revision we use a tag at the end of the document like: + ```markdown + Last review: GP on 2024-04-20, ... 
+ ``` + +### How to search the documentation + +- Be patient and assume that the documentation is there, but you can't find it + because you are not familiar with it and not because you think the + documentation is poorly done or not organized + +- Look for files that contain words related to what you are looking for + - E.g., `ffind.py XYZ` +- Grep in the documentation looking for words related to what you are looking + for + - E.g., `jackmd trading` +- Scan through the content of the references + - E.g., `all.code_organization.reference.md` +- Grep for the name of a tool in the documentation + +### Ensure that all the docs are cross-referenced in the indices + +- There is a script to check and update the documentation cross-referencing + files in a directory and a file with all the links to the files + ``` + /Users/saggese/src/dev_tools1/linters/amp_fix_md_links.py + docs/all.amp_fix_md_links.explanation.md + ``` + +## List of files + +- The current structure of files is given by: + + ```bash + > tree docs -I '*figs*|test*' --dirsfirst -n -F --charset unicode | grep -v __init__.py + ``` + +- The simple list is: + ```bash + > ls -1 docs + all.code_organization.reference.md + all.documentation_meta.reference.md + all.software_components.reference.md + all.workflow.explanation.md + build + ck.components.reference.md + coding + dash_web_apps + dataflow + datapull + deploying + dev_tools + documentation_meta + general_background + infra + kaizenflow + marketing + monitoring + oms + onboarding + trading_ops + work_organization + work_tools + ``` + +### Description + +- Please keep the directory in alphabetical order + +- `all.documentation_meta.reference.md`: contains rules and conventions for all + the documentation under `docs` +- `all.code_organization.reference.md`: describes how the code is organized in + terms of components, libraries, and directories +- `all.software_components.reference.md`: lists all the software components in + the codebase +- `all.workflow.explanation.md`: describes all the workflows for quants, quant + devs, and devops +- `build`: information related to the build system and GitHub actions +- `ck.components.reference.md`: list software components and maintainers +- `coding` + - Guidelines and good practices for coding and code-adjacent activities (such + as code review) + - This includes general tips and tricks that are useful for anybody writing + any code (e.g., how to use type hints) as well as in-depth descriptions of + specific functions and libraries +- `dash_web_apps` +- `dataflow`: docs related to the framework of implementing and running machine + learning models +- `datapull`: docs related to dataset handling: downloading, onboarding, + interpretation, etc. +- `deploying` +- `dev_tools` +- `documentation_meta`: how to write documentation for code and workflows +- `general_background`: documents that provide general reference information, + often across different topics + - E.g., glossaries, reading lists +- `infra`: docs related to the company’s infrastructure + - E.g., AWS services, code deployment, monitoring, server administration, etc. +- `kaizenflow`: docs related to high-level packages that are used across the + codebase , as well as overall codebase organization. 
+ - E.g., `helpers`, `config` +- `marketing` +- `monitoring` +- `oms` +- `onboarding`: practicalities of on-boarding new team members + - E.g., things typically done only once at the beginning of joining the team +- `trading_ops`: docs related to placing and monitoring trading orders to market + or broker +- `work_organization`: how the work is organized on a general level + - E.g., the company's adopted practices spanning coding and development +- `work_tools`: how to set up, run and use various software needed for + development + - E.g., IDE + +Last review: GP on 2024-08-11 diff --git a/docs/all.software_components.reference.md b/docs/all.software_components.reference.md index f6c2b5ba31..decacfe5e7 100644 --- a/docs/all.software_components.reference.md +++ b/docs/all.software_components.reference.md @@ -1,4 +1,4 @@ - +# Software Components @@ -156,18 +156,18 @@ process_forecasts() HistoricalDataSource ``` -# Conventions +## Conventions - A dir is a C4 container (level 2) - A subdir is a C4 component (level 3) - A class is a C4/UML class (level 4) - We use the same level of header for each of these C4 levels -# `DataPull` +## `DataPull` -## Extract +### Extract -### `Extractor` +#### `Extractor` - File: im_v2/common/data/extract/extractor.py - Responsibilities: abstract class for downloading raw data from vendors @@ -182,9 +182,9 @@ classDiagram Extractor <|-- CryptoChassisExtractor ``` -## QA +### QA -### `QaCheck` +#### `QaCheck` - File: sorrentum_sandbox/common/validate.py - Responsibilities: QA check on one or more datasets @@ -205,7 +205,7 @@ classDiagram QaCheck <|-- DuplicateDifferingOhlcvCheck ``` -### `DataSetValidator` +#### `DataSetValidator` - File: sorrentum_sandbox/common/validate.py - Responsibilities: Apply a set of QA checks to validate one or more datasets @@ -220,23 +220,23 @@ classDiagram DataSetValidator <|-- DataFrameDatasetValidator ``` -## Transform +### Transform -## DB +### DB -### `DbConnectionManager` +#### `DbConnectionManager` -### `TestImDbHelper` +#### `TestImDbHelper` -## Universe +### Universe -### `FullSymbol` +#### `FullSymbol` - Responsibilities: implement full symbol (e.g., `binance:BTC_USDT`) -## Client +### Client -### `ImClient` +#### `ImClient` - Responsibilities: adapts the data from a vendor to the MarketData format (i.e., a wide format with knowledge time) @@ -258,66 +258,66 @@ classDiagram RealTimeImClient <|-- SqlRealTimeImClient ``` -### `DataFrameImClient` +#### `DataFrameImClient` - Responsibilities: read data from a passed dataframe - This is used for synthetic data -### `HistoricalPqByTileClient` +#### `HistoricalPqByTileClient` - Responsibilities: read historical data stored as Parquet by-tile -### `HistoricalPqByCurrencyPairTileClient` +#### `HistoricalPqByCurrencyPairTileClient` - Responsibilities: read historical data stored as Parquet by asset -### `HistoricalPqByDateClient` +#### `HistoricalPqByDateClient` - Responsibilities: read historical data stored as Parquet by tile -### `RealTimeImClient` +#### `RealTimeImClient` - Responsibilities: type representing a real-time client -### `SqlRealTimeImClient` +#### `SqlRealTimeImClient` - Responsibilities: read data from a table of an SQL DB -### `ImClientTestCase` +#### `ImClientTestCase` - Responsibilities: help test for classes derived from `ImClient` -### `RawDataReader`read data from a table of an SQL DB +#### `RawDataReader`read data from a table of an SQL DB - Responsibilities: read data based on a dataset signature -# `market_data` +## `market_data` -### `MarketData` +#### 
`MarketData` - Responsibilities: - Interactions: - Main methods: -### `ImClientMarketData` +#### `ImClientMarketData` -### `MarketData_*_TestCase` +#### `MarketData_*_TestCase` -### `RealTimeMarketData` +#### `RealTimeMarketData` -### `RealTimeMarketData2` +#### `RealTimeMarketData2` -### `ReplayedMarketData` +#### `ReplayedMarketData` -### `HorizontalStitchedMarketData` +#### `HorizontalStitchedMarketData` -### `IgStitchedMarketData` +#### `IgStitchedMarketData` -# `dataflow` +## `dataflow` -# `dataflow/core` +## `dataflow/core` -### `Node` +#### `Node` - Responsibilities: - Store and retrieve its output values on a per-method (e.g., "fit" and @@ -330,7 +330,7 @@ classDiagram - Main methods: -### `DAG` +#### `DAG` - Responsibilities: - Build a DAG of `Nodes` by adding and connecting `Node`s @@ -343,7 +343,7 @@ classDiagram - Main methods: -### `DagBuilder` +#### `DagBuilder` - Responsibilities: - Abstract class for creating DAGs @@ -353,14 +353,14 @@ classDiagram customized when building the `DAG` - `get_dag()`: builds the `DAG` -### `DagRunner` +#### `DagRunner` - Responsibilities: - Run a `DAG` by calling a `Method` on all the nodes - Interactions: - Main methods: -### `FitPredictDagRunner` +#### `FitPredictDagRunner` - Responsibilities: - Run a `DAG` with `fit`, `predict` methods @@ -370,7 +370,7 @@ classDiagram on - `fit()`, `predict()` to run the corresponding methods -### `RollingFitPredictDagRunner` +#### `RollingFitPredictDagRunner` - Responsibilities: - Run a `DAG` by periodic fitting on previous history and evaluating on new @@ -378,73 +378,73 @@ classDiagram - Interactions: - Main methods: -### `ResultBundle` +#### `ResultBundle` - Responsibilities: - Store `DAG` execution results. - Interactions: - Main methods: -# dataflow/core/nodes +## dataflow/core/nodes -### `FitPredictNode` +#### `FitPredictNode` - Abstract node implementing `fit()` / `predict()` function - Store and load state -### `DataSource` +#### `DataSource` - DataSource <|-- FitPredictNode - Abstract - Generate train/test data from the passed data frame -### `Transformer` +#### `Transformer` - FitPredictNode <|-- DataSource - Abstract - Single-input single-output node calling a stateless transformation -### `YConnector` +#### `YConnector` - FitPredictNode <|-- YConnector - Create an output df from two input dataframes -### `GroupedColDfToDfColProcessor` +#### `GroupedColDfToDfColProcessor` -### `CrossSectionalDfToDfColProcessor` +#### `CrossSectionalDfToDfColProcessor` - Wrappers for cross-sectional transformations -### `SeriesToDfColProcessor` +#### `SeriesToDfColProcessor` - Series-to-dataframe wrapper -### `SeriesToSeriesColProcessor` +#### `SeriesToSeriesColProcessor` - Series-to-series wrapper -### `DfStacker` +#### `DfStacker` - Stack and unstack dataframes with identical columns -### +#### -# dataflow/system +## dataflow/system -### `RealTimeDagRunner` +#### `RealTimeDagRunner` - Run a DAG in real-time -### `ProcessForecastsNode` +#### `ProcessForecastsNode` -### `HistoricalDataSource` +#### `HistoricalDataSource` - Adapt a `MarketData` object to a DAG - Store and load the state of the node. 
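- For intuition, the fit/predict node pattern shared by the components above can be reduced to a very small sketch. The classes below (`MySource`, `MyTransformer`) are purely illustrative and are much simpler than the real `FitPredictNode`, `DataSource`, and `HistoricalDataSource` interfaces; only the shape of the `fit()` / `predict()` contract is meant to carry over.

  ```python
  # Conceptual sketch only: names and signatures are hypothetical and much
  # simpler than the actual dataflow node interfaces.
  import pandas as pd


  class MySource:
      """Produce a dataframe of raw data (stands in for a data source node)."""

      def __init__(self, df: pd.DataFrame) -> None:
          self._df = df

      def fit(self) -> pd.DataFrame:
          # A real node would return the in-sample slice of the data here.
          return self._df

      def predict(self) -> pd.DataFrame:
          # A real node would return the out-of-sample slice of the data here.
          return self._df


  class MyTransformer:
      """Apply a stateless transformation to the upstream dataframe."""

      def fit(self, df: pd.DataFrame) -> pd.DataFrame:
          return df.pct_change()

      def predict(self, df: pd.DataFrame) -> pd.DataFrame:
          return df.pct_change()


  if __name__ == "__main__":
      source = MySource(pd.DataFrame({"close": [100.0, 101.0, 99.5]}))
      transformer = MyTransformer()
      # A DagRunner walks the DAG and calls the same method on every node in
      # topological order; the line below imitates that for two nodes.
      out = transformer.fit(source.fit())
      print(out)
  ```

- In the real framework a `DagRunner` invokes `fit()` or `predict()` on every node of the `DAG` in topological order, which is what the last lines of the sketch imitate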
-### `RealTimeDataSource` +#### `RealTimeDataSource` -### `System` +#### `System` - Responsibilities: abstract class that builds a `System` - Interactions: there are several derived classes that allow to build various @@ -462,25 +462,25 @@ classDiagram ``` - Main methods: -### `ForecastSystem` +#### `ForecastSystem` -### `Df_ForecastSystem` +#### `Df_ForecastSystem` -### `NonTime_ForecastSystem` +#### `NonTime_ForecastSystem` -### `Time_ForecastSystem` +#### `Time_ForecastSystem` -### `ForecastSystem_with_DataFramePortfolio` +#### `ForecastSystem_with_DataFramePortfolio` -### `Time_ForecastSystem_with_DataFramePortfolio` +#### `Time_ForecastSystem_with_DataFramePortfolio` -### `Time_ForecastSystem_with_DatabasePortfolio_and_OrderProcessor` +#### `Time_ForecastSystem_with_DatabasePortfolio_and_OrderProcessor` -### `Time_ForecastSystem_with_DatabasePortfolio` +#### `Time_ForecastSystem_with_DatabasePortfolio` -# dataflow/backtest +## dataflow/backtest -### Forecaster +#### Forecaster - It is a DAG system that forecasts the value of the target economic quantities (e.g., @@ -491,7 +491,7 @@ for each asset in the target - Interactions: - Main methods: -### `MarketOms` +#### `MarketOms` MarketOms is the interface that allows to place orders and receive back fills to the specific target market. This is provided as-is and it's not under control of @@ -499,15 +499,15 @@ the user or of the protocol - E.g., a specific exchange API interface -### `OrderProcessor` +#### `OrderProcessor` - TODO(gp): Maybe MockedMarketOms since that's the actual function? -### `OmsDb` +#### `OmsDb` Simulation -### `ImplementedBroker` +#### `ImplementedBroker` - `submit_orders()` - Save files in the proper location @@ -520,21 +520,21 @@ Mocked system - Our implementation of the implemented system where we replace DB with a mock - The mocked DB should be as similar as possible to the implemented DB -### `DatabaseBroker` +#### `DatabaseBroker` - `submit_orders()` - Same behavior of `ImplementedBroker` but using `OmsDb` -### `OmsDb` +#### `OmsDb` - `submitted_orders` table (mocks S3) - Contain the submitted orders - `accepted_orders` table - `current_position` table -# oms/fill +## oms/fill -### `Fill` +#### `Fill` - Responsibilities: - Represent an order fill @@ -542,14 +542,14 @@ Mocked system - `Order` - Main methods: -# oms/order +## oms/order -### `Order` +#### `Order` - Responsibilities: - Represent an order to be executed over a period of time -# oms/broker +## oms/broker ```mermaid classDiagram @@ -562,7 +562,7 @@ Broker <|-- AbstractCcxtBroker : Inheritance AbstractCcxtBroker <|-- CcxtBroker : Inheritance ``` -### `Broker` +#### `Broker` - Description - A `Broker` is an object to place orders to the market and to receive fills, @@ -589,25 +589,25 @@ AbstractCcxtBroker <|-- CcxtBroker : Inheritance - `submit_orders()`: submit orders to the trading exchange - `get_fills()` -### `FakeFillsBroker` +#### `FakeFillsBroker` - Responsibilities: - Interactions: - Main methods: -### `DataFrameBroker` +#### `DataFrameBroker` - Responsibilities: - Interactions: - Main methods: -### `DatabaseBroker` +#### `DatabaseBroker` - Responsibilities: - Interactions: - Main methods: -### `ReplayedDataReader` +#### `ReplayedDataReader` - Responsibilities: - Replay data from an actual `RawDataReader` @@ -615,7 +615,7 @@ AbstractCcxtBroker <|-- CcxtBroker : Inheritance - Derived from `DataFrameBroker` - Main methods: -### `ReplayedFillsDataFrameBroker` +#### `ReplayedFillsDataFrameBroker` - Responsibilities: - Replay the fills from a Broker 
@@ -623,7 +623,7 @@ AbstractCcxtBroker <|-- CcxtBroker : Inheritance - Derived from `DataFrameBroker` - Main methods: -# oms/broker/ccxt +## oms/broker/ccxt ```mermaid classDiagram @@ -631,7 +631,7 @@ classDiagram AbstractCcxtBroker <|-- CcxtBroker : Inheritance ``` -### `AbstractCcxtBroker` +#### `AbstractCcxtBroker` - Responsibilities: - Retrieve broker configuration, market data (including CCXT), open positions, @@ -661,7 +661,7 @@ classDiagram - `get_total_balance()`: Retrieves, validates, and logs the total available balance from an exchange. -### `CcxtBroker` +#### `CcxtBroker` - Responsibilities: - Manage CCXT interactions, submit orders, handle cancellations, and sync with @@ -677,7 +677,7 @@ classDiagram - `_get_ccxt_order_structure()`: Get the CCXT order structure corresponding to the submitted order. -# oms/limit_computer +## oms/limit_computer ```mermaid classDiagram @@ -685,7 +685,7 @@ classDiagram AbstractLimitPriceComputer <|-- LimitPriceComputerUsingVolatility : Inheritance ``` -### `AbstractLimitPriceComputer` +#### `AbstractLimitPriceComputer` - Responsibilities: - Provide methods to retrieve timestamp data, extract latest bid/ask sizes, @@ -702,7 +702,7 @@ classDiagram latest/mean bid/ask price. - `normalize_bid_ask_data()`: Validate and normalize the bid ask data. -### `LimitPriceComputerUsingSpread` +#### `LimitPriceComputerUsingSpread` - Responsibilities: - Retrieve, compare latest and average bid/ask prices. @@ -717,7 +717,7 @@ classDiagram data and uses a `passivity_factor` to adjust the limit price between the bid and ask prices. -### `LimitPriceComputerUsingVolatility` +#### `LimitPriceComputerUsingVolatility` - Responsibilities: - Compute limit price based on volatility multiple @@ -732,7 +732,7 @@ classDiagram data and uses a `volatility_multiple` to adjust the limit price based on the volatility of the bid and ask prices. -# oms/child_order_quantity_computer +## oms/child_order_quantity_computer ```mermaid classDiagram @@ -740,7 +740,7 @@ classDiagram AbstractChildOrderQuantityComputer <|-- StaticSchedulingChildOrderQuantityComputer : Inheritance ``` -### `AbstractChildOrderQuantityComputer` +#### `AbstractChildOrderQuantityComputer` - Responsibilities: - Represent strategy to decide child order quantities within a parent order @@ -755,7 +755,7 @@ classDiagram - `update_current_positions()`: Update the current positions using data from the Broker. -### `DynamicSchedulingChildOrderQuantityComputer` +#### `DynamicSchedulingChildOrderQuantityComputer` - Responsibilities: - Place each child order wave with the remaining amount to fill. @@ -773,7 +773,7 @@ classDiagram target_position - open_position ``` -### `StaticSchedulingChildOrderQuantityComputer` +#### `StaticSchedulingChildOrderQuantityComputer` - Responsibilities: - Generate a TWAP-like schedule for placing child orders. @@ -787,9 +787,9 @@ classDiagram quantities for each provided parent order. The quantity is static, so it is calculated only once. -# oms/portfolio +## oms/portfolio -### `Portfolio` +#### `Portfolio` - A Portfolio stores information about asset and cash holdings of a System over time. @@ -818,7 +818,7 @@ classDiagram - We are trying not to mix static typing and duck typing - CASH_ID, `_compute_statistics()` goes in `Portolio` -### `DataFramePortfolio` +#### `DataFramePortfolio` - An implementation of a Portfolio backed by a DataFrame. This is used to simulate a system on an order-by-order basis. 
This should be equivalent to @@ -833,7 +833,7 @@ classDiagram - Update the holdings with fills -> `SimulatedBroker.get_fills()` - To make the simulated system closer to the implemented -### `DatabasePortfolio` +#### `DatabasePortfolio` an implementation of a Portfolio backed by an SQL Database to simulate systems where the Portfolio state is held in a database. This allows to simulate a @@ -842,7 +842,7 @@ system on an order-by-order basis. - `get_holdings()` - Same behavior of `ImplementedPortfolio` but using `OmsDb` -### ImplementedPortfolio +#### ImplementedPortfolio - `get_holdings()` - Check self-consistency and assumptions @@ -851,9 +851,9 @@ system on an order-by-order basis. - `update_state()` - No-op since the portfolio is updated automatically -# oms/order_processing +## oms/order_processing -### `OrderProcessor` +#### `OrderProcessor` - Monitor `OmsDb.submitted_orders` - Update `OmsDb.accepted_orders` @@ -861,7 +861,7 @@ system on an order-by-order basis. - TODO(gp): Unclear where it is used? -### `process_forecasts` +#### `process_forecasts` - Responsibilities: - Process all the forecasts from `prediction_df` using @@ -873,7 +873,7 @@ system on an order-by-order basis. different optimization conditions, spread, and restrictions, without running the Forecaster -### `TargetPositionAndOrderGenerator` +#### `TargetPositionAndOrderGenerator` - Responsibilities: - Retrieve the current holdings from `Portfolio` @@ -911,12 +911,12 @@ system on an order-by-order basis. - For IS it is different - It should not use any concrete implementation but only `Abstract\*` -## Locates +### Locates -## Restrictions +### Restrictions -# oms/optimizer +## oms/optimizer -# oms/db +## oms/db -# oms/ccxt +## oms/ccxt diff --git a/docs/all.workflow.explanation.md b/docs/all.workflow.explanation.md index d26f9c1e69..89de9c350a 100644 --- a/docs/all.workflow.explanation.md +++ b/docs/all.workflow.explanation.md @@ -1,4 +1,4 @@ - +# Workflow @@ -33,7 +33,7 @@ -# KaizenFlow workflow explanation +## KaizenFlow workflow explanation This document is a roadmap of most activities that Quants, Quant devs, and DevOps can perform using `KaizenFlow`. @@ -44,7 +44,7 @@ notebooks) in the repo. A high-level description of KaizenFlow is [KaizenFlow White Paper](/papers/DataFlow_stream_computing_framework/DataFlow_stream_computing_framework.pdf) -# Work organization +## Work organization - Issues workflow explained [`amp/docs/work_organization/ck.issue_workflow.explanation.md`](/docs/work_organization/ck.issue_workflow.explanation.md) @@ -52,11 +52,11 @@ A high-level description of KaizenFlow is [`/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md`](/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) - TODO(Grisha): add more from `/docs/work_organization/`. 
-# Set-up +## Set-up - TODO(gp): Add pointers to the docs we ask to read during the on-boarding -## Documentation_meta +### Documentation_meta - The dir `docs/documentation_meta` contains documents about writing the documentation @@ -79,7 +79,7 @@ A high-level description of KaizenFlow is -# Quant workflows +## Quant workflows The life of a Quant is spent between: @@ -107,7 +107,7 @@ These activities are mapped in `KaizenFlow` as follows: -## `DataPull` +### `DataPull` - General intro to `DataPull` - [/docs/datapull/ck.datapull.explanation.md](/docs/datapull/ck.datapull.explanation.md) @@ -116,7 +116,7 @@ These activities are mapped in `KaizenFlow` as follows: - [/docs/datapull/all.datapull_sandbox.explanation.md](/docs/datapull/all.datapull_sandbox.explanation.md) - [/docs/datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md](/docs/datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md) -### Universe +#### Universe - Universe explanation - [/docs/datapull/ck.universe.explanation.md](/docs/datapull/ck.universe.explanation.md) @@ -125,7 +125,7 @@ These activities are mapped in `KaizenFlow` as follows: - [/im_v2/common/universe/notebooks/Master_universe_analysis.ipynb](/im_v2/common/universe/notebooks/Master_universe_analysis.ipynb) - [/im_v2/ccxt/notebooks/Master_universe.ipynb](/im_v2/ccxt/notebooks/Master_universe.ipynb) -### Dataset signature +#### Dataset signature - Organize and label datasets - Helps to uniquely identify datasets across different sources, types, @@ -152,7 +152,7 @@ These activities are mapped in `KaizenFlow` as follows: generalized for more sources - Download data in real time over a given time interval - - [/im_v2/ccxt/data/extract/download_exchange_data_to_db_periodically.py](/im_v2/ccxt/data/extract/download_exchange_data_to_db_periodically.py) + - [/im_v2/common/data/extract/periodic_download_exchange_data_to_db.py](/im_v2/common/data/extract/periodic_download_exchange_data_to_db.py) - Archive data - Helps with optimizing data storage performance/costs by transferring older @@ -209,9 +209,9 @@ These activities are mapped in `KaizenFlow` as follows: -## `DataFlow` +### `DataFlow` -### Meta +#### Meta - Best practices for Quant research - [/docs/dataflow/ck.research_methodology.explanation.md](/docs/dataflow/ck.research_methodology.explanation.md) @@ -222,7 +222,7 @@ These activities are mapped in `KaizenFlow` as follows: - TODO(Grisha): does this belong to `DataFlow`? - TODO(Grisha): `ck.master_notebooks...` -> `all.master_notebooks`? -### DAG +#### DAG - General concepts of `DataFlow` - Introduction to KaizenFlow, DAG nodes, DataFrame as unit of computation, DAG @@ -312,7 +312,7 @@ These activities are mapped in `KaizenFlow` as follows: - [/docs/dataflow/ck.load_alpha_and_trades.tutorial.py](/docs/dataflow/ck.load_alpha_and_trades.tutorial.py) - TODO(gp): add more comments -### System +#### System - Learn how to build `System` - TODO(gp): @grisha what do we have for this? 
@@ -368,11 +368,11 @@ These activities are mapped in `KaizenFlow` as follows: -# Quant dev workflows +## Quant dev workflows -## DataPull +### DataPull - Learn how to create a `DataPull` adapter for a new data source - [/docs/datapull/all.dataset_onboarding_checklist.reference.md](/docs/datapull/all.dataset_onboarding_checklist.reference.md) @@ -413,39 +413,39 @@ These activities are mapped in `KaizenFlow` as follows: -## DataFlow +### DataFlow - All software components - [/docs/dataflow/ck.data_pipeline_architecture.reference.md](/docs/dataflow/ck.data_pipeline_architecture.reference.md) -# TradingOps workflows +## TradingOps workflows -## Trading execution +### Trading execution -### Intro +#### Intro - Binance trading terms - [/docs/oms/broker/ck.binance_terms.reference.md](/docs/oms/broker/ck.binance_terms.reference.md) -### Components +#### Components - OMS explanation - [/docs/oms/ck.oms.explanation.md](/docs/oms/ck.oms.explanation.md) - CCXT log structure - [/docs/oms/broker/ck.ccxt_broker_logs_schema.reference.md](/docs/oms/broker/ck.ccxt_broker_logs_schema.reference.md) -### Testing +#### Testing - Replayed CCXT exchange explanation - [/docs/oms/broker/ck.replayed_ccxt_exchange.explanation.md](/docs/oms/broker/ck.replayed_ccxt_exchange.explanation.md) - How to generate broker test data - [/docs/oms/broker/ck.generate_broker_test_data.how_to_guide.md](/docs/oms/broker/ck.generate_broker_test_data.how_to_guide.md) -### Procedures +#### Procedures - Trading procedures (e.g., trading account information) - [/docs/trading_ops/ck.trading.how_to_guide.md](/docs/trading_ops/ck.trading.how_to_guide.md) @@ -456,7 +456,7 @@ These activities are mapped in `KaizenFlow` as follows: -# MLOps workflows +## MLOps workflows - Encrypt a model - [/docs/dataflow/ck.release_encrypted_models.explanation.md](/docs/dataflow/ck.release_encrypted_models.explanation.md) @@ -464,7 +464,7 @@ These activities are mapped in `KaizenFlow` as follows: -## Deploying +### Deploying - Model deployment in production - [/docs/deploying/all.model_deployment.how_to_guide.md](/docs/deploying/all.model_deployment.how_to_guide.md) @@ -475,7 +475,7 @@ These activities are mapped in `KaizenFlow` as follows: -## Monitoring +### Monitoring - Monitor system - [/docs/monitoring/ck.monitor_system.how_to_guide.md](/docs/monitoring/ck.monitor_system.how_to_guide.md) @@ -486,7 +486,7 @@ These activities are mapped in `KaizenFlow` as follows: -# DevOps workflows +## DevOps workflows The documentation outlines the architecture and deployment processes for the Kaizen Infrastructure, leveraging a blend of AWS services, Kubernetes for @@ -495,7 +495,7 @@ Emphasizing Infrastructure as Code (IaC), the project employs Terraform for provisioning and Ansible for configuration, ensuring a maintainable and replicable environment. -## Overview +### Overview - Development and deployment stages - [/docs/infra/ck.development_stages.explanation.md](/docs/infra/ck.development_stages.explanation.md) @@ -505,7 +505,7 @@ replicable environment. - This document provides an overview of the S3 buckets utilized by Kaizen Technologies. -## Current set-up description +### Current set-up description - Document details steps for setting up Kaizen infrastructure - [/docs/infra/ck.kaizen_infrastructure.reference.md](/docs/infra/ck.kaizen_infrastructure.reference.md) @@ -513,7 +513,7 @@ replicable environment. 
- EC2 servers overview - [/docs/infra/ck.ec2_servers.explanation.md](/docs/infra/ck.ec2_servers.explanation.md) -## Set up infra +### Set up infra - Document the implementation of Auto Scaling in the Kubernetes setup, focusing on the Cluster Autoscaler (CA), Horizontal Pod Autoscaler (HPA), and Auto diff --git a/docs/build/all.linter_gh_workflow.explanation.md b/docs/build/all.linter_gh_workflow.explanation.md index b69568516e..facb01dff2 100644 --- a/docs/build/all.linter_gh_workflow.explanation.md +++ b/docs/build/all.linter_gh_workflow.explanation.md @@ -1,4 +1,6 @@ -# Linter Github Action Workflow Explanation +# Linter Gh Workflow + +## Linter Github Action Workflow Explanation @@ -9,7 +11,7 @@ -# Overview +## Overview - We want to use linter for all the new code that needs to be merged into the `master` branch @@ -19,9 +21,9 @@ did not run before. - In this case, the workflow will fail, and will not allow the PR to be merged -# How it works +## How it works -## Fetch master branch +### Fetch master branch In order to compare the changed files in the PR with the latest master branch, fetch the latest master, e.g., @@ -30,7 +32,7 @@ fetch the latest master, e.g., invoke git_fetch_master ``` -## Run the linter and check the linter results +### Run the linter and check the linter results - Run the linter against the changed files in the PR branch diff --git a/docs/build/all.pytest_allure.explanation.md b/docs/build/all.pytest_allure.explanation.md index 2c19fcc8e4..723ac28858 100644 --- a/docs/build/all.pytest_allure.explanation.md +++ b/docs/build/all.pytest_allure.explanation.md @@ -1,4 +1,6 @@ -# Pytest Allure Explanantion +# Pytest Allure + +## Pytest Allure Explanantion @@ -15,7 +17,7 @@ -# Overview +## Overview - Allure Report boosts collaboration and project quality by providing clear, detailed test reports that aid issue resolution for different team members @@ -39,32 +41,32 @@ end A1 ==> B1 ==> C1[View the report in the browser] ``` -# Core features +## Core features -## Rich and Interactive Reports: +### Rich and Interactive Reports: - Allure generates visually appealing and interactive HTML reports, making it easy to analyze test results - Reports include detailed information about test cases, steps, attachments, and more -## Annotations and Labels: +### Annotations and Labels: - Allure uses annotations and labels to provide additional information about test methods, making it easier to understand and categorize test results - Annotations are used to mark and describe test methods, and labels help in categorizing and filtering tests -## Test History and Trends +### Test History and Trends - Allure maintains a history of test runs, allowing you to track changes in test results over time - Trends and statistics help identify patterns, improvements, or regressions in the application's behavior -# Key Components +## Key Components -## Pytest plugin for the Allure output generation +### Pytest plugin for the Allure output generation In order to generate the Allure output, we need to install the `allure-pytest` plugin. For the time and efforts saving reasons, we will install it on-the-fly @@ -73,13 +75,13 @@ in the container where we will run the tests. This feature is introduced as the `allure-pytest` plugin will be installed and the results will be stored in the specified directory. -## Allure reporting tool +### Allure reporting tool The CLI utility for generating Allure reports creates an HTML report from the Allure output. 
We'll install this utility using GitHub Actions workflow to generate the HTML report. -# How it works +## How it works - Allure Report is composed of a framework(pytest) adapter and the allure command-line utility @@ -102,7 +104,7 @@ generate the HTML report. - Use `allure generate` to generate the test report into the specified directory. For e.g.: `allure generate allure-report` -# Historical Trends +## Historical Trends A test report generated by Allure can not only display data about the latest test launch, but also help you compare it with the data from previous reports. @@ -110,7 +112,7 @@ To do so, Allure can keep a history or previous reports. In a tests report with the history included, you can: -- see what statuses did a test have previously (see Details panel → History tab) -- find tests that changed status since last report (see Sorting and filtering → +- See what statuses did a test have previously (see Details panel → History tab) +- Find tests that changed status since last report (see Sorting and filtering → Filter tests by marks) -- see how specific values change over time (see Graphs) +- See how specific values change over time (see Graphs) diff --git a/docs/build/all.pytest_allure.how_to_guide.md b/docs/build/all.pytest_allure.how_to_guide.md index 65c7108ba6..ec7abca61f 100644 --- a/docs/build/all.pytest_allure.how_to_guide.md +++ b/docs/build/all.pytest_allure.how_to_guide.md @@ -1,4 +1,6 @@ -# Pytest Allure How to Guide +# Pytest Allure + +## Pytest Allure How to Guide @@ -11,17 +13,19 @@ -# How to run the flow end-to-end via GH actions +## How to run the flow end-to-end via GH actions Considering that we run the tests on the `cmamp` repo with the fast `tests` group -**Important note**: Unlike usual test run, we don't stop the execution on failure. -For this we use the `continue-on-error: true` in the GitHub action step. +**Important note**: Unlike usual test run, we don't stop the execution on +failure. For this we use the `continue-on-error: true` in the GitHub action +step. -Here is the link to the [GitHub action file](../.github/workflows/DISABLED.allure.fast_test.yml). +Here is the link to the +[GitHub action file](../.github/workflows/DISABLED.allure.fast_test.yml). -# How to generate allure-pytest results +## How to generate allure-pytest results To save Allure results after a test run, append `--allure-dir` parameter to a `pytest` cmd, e.g., @@ -32,7 +36,7 @@ i run_fast_tests ... --allure-dir ./allure_results where `allure-dir` is the directory where the Allure results will be stored. -# How to backup the Allure results +## How to backup the Allure results To backup the Allure results, copy the `allure_results` directory to a AWS S3 bucket, e.g., @@ -48,7 +52,7 @@ where: - `cmamp` is the name of the GitHub repo - `fast` is the name of the tests group -# How to generate Allure HTML-report +## How to generate Allure HTML-report - Whenever Allure generates a test report in a specified directory i.e. `allure-report` (refer to the @@ -63,7 +67,6 @@ To install the Allure CLI utility, refer to the [official docs](https://allurereport.org/docs/gettingstarted-installation/). In order to generate the HTML report, run the following command: - ``` allure generate allure_results -o allure_report ``` @@ -75,14 +78,13 @@ where: TODO(Vlad): Come up with a clean-up strategy for the S3 bucket. 
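
Putting the steps above together, an end-to-end pass looks roughly like the sketch below. The directory names (`allure_results`, `allure_report`) match the examples in this guide, while the S3 destination is a placeholder that should follow the bucket layout described in the backup section.

```bash
# 1) Run the tests and collect the Allure results.
i run_fast_tests --allure-dir ./allure_results
# 2) Back up the raw results to S3 (destination prefix is illustrative).
aws s3 cp ./allure_results s3://<bucket>/allure_results/cmamp/fast/ --recursive
# 3) Generate the HTML report from the results.
allure generate allure_results -o allure_report
```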
-## Keep the history of test runs +### Keep the history of test runs - To activate the features related to history, copy the history subdirectory from the previous report into the latest test results directory before generating the subsequent test report - Here is an example of how to do it, assuming that your project is configured to use `allure-results` and `allure-report` directories: - - Make sure you have the previous report generated in the `allure-report` directory - Remove the `allure-results` directory @@ -91,11 +93,12 @@ TODO(Vlad): Come up with a clean-up strategy for the S3 bucket. - Generate the new report To copy the history subdirectory from the previous run to the `allure_results`: + ```bash aws s3 cp s3://cryptokaizen-html/allure_reports/cmamp/fast/report.20231120_102030/history allure_results/history --recursive ``` -# How to publish the Allure-HTML report +## How to publish the Allure-HTML report To publish the Allure report, copy the `allure_report` directory to a AWS S3 bucket, e.g., diff --git a/docs/coding/all.asyncio.explanation.md b/docs/coding/all.asyncio.explanation.md index 4f492c5752..012187eb45 100644 --- a/docs/coding/all.asyncio.explanation.md +++ b/docs/coding/all.asyncio.explanation.md @@ -24,8 +24,7 @@ practices to avoid common pitfalls. ## Nomenclature in asyncio - Event Loop - - - asyncio operates on an event loop that manages the execution of asynchronous + - Asyncio operates on an event loop that manages the execution of asynchronous tasks. Whenever one wants to execute the asynchronous tasks we do `asyncio.run()` or `asyncio.run_until_complete()`: both of these methods will start the event loop and it will be set to running yielding control to @@ -38,7 +37,6 @@ practices to avoid common pitfalls. compatibility with asyncio. - Coroutines (aka "async functions" defined with `async def`) - - These functions can be paused and resumed without blocking other tasks. - `await` @@ -58,13 +56,11 @@ error. To avoid this: - Solution 1: use `nest_asyncio` - - `nest_asyncio` is a library that allows you to create nested event loops. While this may seem like a solution but may lead to complex issues. This was mainly developed to run `asyncio` in Jupyter/ipython which already runs an event loop in backend. This library also does not support `asyncio_solipsism` so there is another trade-off. - - Here's how nest_asyncio works: - It saves the current event loop, if any, that is running in the environment. @@ -74,14 +70,13 @@ To avoid this: original event loop, ensuring compatibility with the environment. - Solution 2: use threads - - Instead of starting a new event loop, run that specific part of your code in a separate thread to prevent conflicts. 
This solves the issue but using thread has its own complications such as race conditions which can be difficult to debug - Solution 3: embrace "async all the way up" approach - - use `await` instead of nested call to `asyncio.run` and make your methods + - Use `await` instead of nested call to `asyncio.run` and make your methods asynchronous using `async def` all the way #### Example Code @@ -101,12 +96,10 @@ Consider the following coroutines style B fill:#98FB98, stroke:#2E8B57 style C fill:#ADD8E6, stroke:#4682B4 ``` - ``` import asyncio import helpers.hasyncio as hasynci - # Corresponds to `submit twap` in CmampTask5842 async def A(): print("IN A") @@ -114,14 +107,12 @@ Consider the following coroutines print("ENTER B") B() - # Corresponds to `get_fill_per_order` async def C(): print("IN C") await asyncio.sleep(2) print("EXIT C") - # get_fill def B(): print("IN B") @@ -129,13 +120,11 @@ Consider the following coroutines asyncio.get_running_loop().run_until_complete(cor) print("EXIT B") - # Call A. hasynci.run(A(), asyncio.get_event_loop(), close_event_loop=False) ``` - The code above won't work and will give - ``` Error: "Event loop is already running" ``` @@ -147,7 +136,6 @@ Consider the following coroutines already running' error." - Adding - ``` import nest_asyncio diff --git a/docs/coding/all.code_design.how_to_guide.md b/docs/coding/all.code_design.how_to_guide.md index 2509f01b7c..39f0db6cce 100644 --- a/docs/coding/all.code_design.how_to_guide.md +++ b/docs/coding/all.code_design.how_to_guide.md @@ -1,4 +1,4 @@ - +# Code Design @@ -22,9 +22,9 @@ -# Design Philosophy +## Design Philosophy -## Measure seven times, cut once (Russian proverb) +### Measure seven times, cut once (Russian proverb) - Before doing any work, sit down and plan - Describe somewhere _in writing_ your high-level plan. Put it in a Google doc @@ -76,13 +76,13 @@ - Do not disappear for one week and come back with something that makes sense only to you, or that you didn’t get buy-in from others on -## Hacker laws +### Hacker laws - A list of interesting "laws" (some are more rule of thumbs / heuristics) related to computing: - [hacker-laws](https://github.com/dwmkerr/hacker-laws) -## Keep it simple +### Keep it simple - Follow the [KISS principle](https://en.wikipedia.org/wiki/KISS_principle). - Pursue simple, elegant solutions. Some things are inherently complex, but even @@ -92,7 +92,7 @@ - Modify - Debug -## Tips from a pro +### Tips from a pro - Adapted from [these slides](https://www.slideshare.net/adrianionel/software-engineering-advice-from-googles-jeff-dean-for-big-distributed-systems) @@ -100,7 +100,7 @@ [Jeff Dean]() (the Chuck Norris of SWE) -### Designing software systems is tricky +#### Designing software systems is tricky - Need to balance: - Simplicity [note that this comes first!] @@ -110,7 +110,7 @@ - Generality - Features [note that this comes last!] -### Get Advice Early! +#### Get Advice Early! - Get advice - Before you write any code @@ -121,7 +121,7 @@ - Chat about the design with colleagues - Consider discussing multiple potential designs -### Interfaces +#### Interfaces - Think carefully about interfaces in your system! - Imagine other hypothetical clients trying to use your interface @@ -129,9 +129,9 @@ - Get feedback on your interfaces before implementing! 
- The best way to learn is to look at well-designed interfaces -# Architecture +## Architecture -## Use design patterns +### Use design patterns - [Design patterns](https://en.wikipedia.org/wiki/Software_design_pattern) are idioms or recipes for solving problems that commonly appear in software @@ -152,9 +152,9 @@ - Simplify the high-level picture of your code - Make it easier for other people to understand your code -# Functions +## Functions -## Avoid modifying the function input +### Avoid modifying the function input - If, for example, a function `f` accepts a dataframe `df` as its (sole) argument, then, ideally, `f(df)` will not modify `df`. If modifications are @@ -179,7 +179,7 @@ code in a notebook that will return the same results when re-executed out of order. -## Prefer pure functions by default +### Prefer pure functions by default - [Pure functions](https://en.wikipedia.org/wiki/Pure_function) have two key properties: @@ -200,7 +200,7 @@ pragmatic to dogmatically insist upon a functional style (especially in our domain and when using Python). -# Invariants +## Invariants From ./oms/architecture.md @@ -235,9 +235,9 @@ Invariants and conventions - The Optimizer only thinks in terms of dollar -## Our approach to doing things +### Our approach to doing things -### Roles and responsibilities +#### Roles and responsibilities - How to communicate - Telegram for urgent stuff or interactive things (ideally on a small group @@ -266,7 +266,7 @@ Invariants and conventions - E.g., `i gh_create_pr` - We like draft PRs to discuss architecture before unit testing -### Good practices +#### Good practices - Good Issue reports - What are you trying to achieve diff --git a/docs/coding/all.code_like_pragmatic_programmer.how_to_guide.md b/docs/coding/all.code_like_pragmatic_programmer.how_to_guide.md index 8a69fa5305..a0a82b66e7 100644 --- a/docs/coding/all.code_like_pragmatic_programmer.how_to_guide.md +++ b/docs/coding/all.code_like_pragmatic_programmer.how_to_guide.md @@ -1,4 +1,4 @@ - +# Code Like Pragmatic Programmer @@ -185,18 +185,18 @@ -# A pragmatic philosophy +## A pragmatic philosophy -## PP_Tip 1: Care about your craft +### PP_Tip 1: Care about your craft - Why spending your life developing software unless you care doing it well? -## PP_Tip 2: Think! About your work +### PP_Tip 2: Think! About your work - Turn off the autopilot and take control - Constantly critique and evaluate your work -## PP_Tip 3: Provide options, don't make lame excuses +### PP_Tip 3: Provide options, don't make lame excuses - Things go wrong: - Deliverables are late @@ -211,18 +211,18 @@ - "Did you consider that?" - ... 
-## Broken windows +### Broken windows - = bad designs, wrong decisions, poor code, rotten software -## PP_Tip 4: Don't live with broken windows +### PP_Tip 4: Don't live with broken windows - A broken window left un-repaired instills a sense of abandonment - Don't live with broken windows un-repaired - If there is no time, board it up - Take action to show that you are on top of the situation -## PP_Tip 5: Be a catalyst for change +### PP_Tip 5: Be a catalyst for change - Sometimes you know what is right, but if you ask for permissions you will be slowed down @@ -232,18 +232,18 @@ - It's easier to ask forgiveness, than it is to get permission -## PP_Tip 6: Remember the big picture +### PP_Tip 6: Remember the big picture - Projects slowly and inexorably get totally out of hand - Missing a deadline happens one day at a time -## PP_Tip 7: Make quality a requirement issue +### PP_Tip 7: Make quality a requirement issue - One of the requirements from the user should be "how good do you want the software to be?" - Good software today is better than perfect software tomorrow -## PP_Tip 8: Invest regularly in your knowledge portfolio +### PP_Tip 8: Invest regularly in your knowledge portfolio - Your knowledge and experience are your most important professional assets - Unfortunately they are expiring assets @@ -258,13 +258,13 @@ - Don't put all the technical eggs in one basket 4. Review and rebalance periodically -## PP_Tip 9: Critically analyze what you read and hear +### PP_Tip 9: Critically analyze what you read and hear - Beware of media hype - Beware of zealots who insist that their dogma provides the only answer - A best seller book is not necessarily a good book -## PP_Tip 10: It's both what you say and the way you say it +### PP_Tip 10: It's both what you say and the way you say it - Plan what you want to say: write an outline - Choose the right moment @@ -274,7 +274,7 @@ - Be a listener - Get back to people -## PP_Tip 10: Remember the WISDOM acrostic +### PP_Tip 10: Remember the WISDOM acrostic - What do you *W*ant them to learn? - What is their *I*nterest in what you've got to say? @@ -283,9 +283,9 @@ - Whom do you want to *O*wn the information? - How can you *M*otivate them to listen? -# A pragmatic approach +## A pragmatic approach -## PP_Tip 11: DRY - Don't repeat yourself +### PP_Tip 11: DRY - Don't repeat yourself - Every piece of information must have a: - Single @@ -294,7 +294,7 @@ - Information is duplicated in multiple places -> maintenance nightmare -## PP_Tip 11: Programs = knowledge +### PP_Tip 11: Programs = knowledge - Programs are made of knowledge (e.g., requirements, specifications, code, internal and external documentation) @@ -304,7 +304,7 @@ - The solution to a problem changes over time - E.g., tests show that an algorithm is not general or does not work -## PP_Tip 11: How does duplication arise? +### PP_Tip 11: How does duplication arise? - There are 4 I's of duplication: @@ -317,7 +317,7 @@ 4. 
Inter-developer - Multiple developers duplicate a piece of info -## PP_Tip 11: Imposed duplication +### PP_Tip 11: Imposed duplication - Multiple representations of the same piece of info - E.g., same info in two pieces of code written in different languages @@ -335,7 +335,7 @@ - Solution: at least warnings and errors are reported - Do not duplicate comments in headers and code -## PP_Tip 11: Inadvertent duplication +### PP_Tip 11: Inadvertent duplication - Un-normalized data structures - E.g., @@ -350,13 +350,13 @@ redundant - Solution: use accessors to read/write object attributes -## PP_Tip 11: Impatient duplication +### PP_Tip 11: Impatient duplication - The temptation is always to cut-and-paste code and then modify it - It increases the technical debt: shortcuts end up in long delays - Solution: re-factor and then change it -## PP_Tip 11: Inter-developer duplication +### PP_Tip 11: Inter-developer duplication - It is hard to detect @@ -367,21 +367,21 @@ to look before writing a routine) - Project librarian to supervise the reuse -## PP_Tip 12: Make it easy to reuse +### PP_Tip 12: Make it easy to reuse - If something is not easy to find, use, reuse, people won't reuse -## Orthogonality +### Orthogonality - = independence, decoupling -## PP_Tip 13: Eliminate effects between unrelated things +### PP_Tip 13: Eliminate effects between unrelated things - Systems are orthogonal when changes in one sub-system don't affect other sub-systems - E.g., helicopter controls are not orthogonal -## PP_Tip 13: Orthogonality: pros +### PP_Tip 13: Orthogonality: pros 1. Easier to change - Changes are localized @@ -393,7 +393,7 @@ - Easier to change design - Not being tied to a particular vendor / product / platform -## PP_Tip 13: How to maintain orthogonality +### PP_Tip 13: How to maintain orthogonality 1. Avoid global state (e.g., singletons) 2. Write shy code @@ -405,22 +405,22 @@ - Improve structure - Increase orthogonality -## PP_Tip 14: There are no final decisions +### PP_Tip 14: There are no final decisions - Do not carve decisions into stone - Think of them as being written on the sand -## PP_Tip 14: There are no final decisions: why +### PP_Tip 14: There are no final decisions: why 1. Requirements can change on us 2. The first decision is not always the best one 3. Change 3rd party components -## PP_Tip 15: Use tracer bullets to find the target +### PP_Tip 15: Use tracer bullets to find the target - Ready, fire, aim! -## Analogy between shooting bullets in the dark and software engineering +### Analogy between shooting bullets in the dark and software engineering - How to shoot a bullet to a target in the dark? @@ -442,29 +442,29 @@ - Achieve end-to-end connection among components with minimal functionality - Then adjust, re-aim, ... 
until you are on target -## PP_Tip 15: Tracer bullets: pros +### PP_Tip 15: Tracer bullets: pros - Users see something working early - You have an integration platform, instead of big-bang integration -## PP_Tip 15: Tracer bullets: cons +### PP_Tip 15: Tracer bullets: cons - Tracer bullets don't always hit their target - Still imagine the result using a waterfall approach -## PP_Tip 16: Prototype to learn +### PP_Tip 16: Prototype to learn - The value of prototype lies not in the code produced, but in the lessons learned -## PP_Tip 16: Prototype to learn: cons +### PP_Tip 16: Prototype to learn: cons - Set the right expectations - Make sure everyone understands that you are writing disposable code - Otherwise management might insist on deploying the prototype or a cleaned up version of it -## PP_Tip 16: Tracer code vs prototyping +### PP_Tip 16: Tracer code vs prototyping - Prototype: - When experimentation is done, the prototype is thrown away and it is @@ -477,7 +477,7 @@ - Over time stubbed routines are completed - The framework stays intact -## PP_Tip 17: Program close to the problem domain +### PP_Tip 17: Program close to the problem domain - Always try to write code using the vocabulary of the application domain - So you are free to concentrate on solving domain problems and ignore petty @@ -488,7 +488,7 @@ - Code can issue domain specific errors - Create metadata compiled or read-in by the main application -## PP_Tip 18: Estimate to avoid surprises +### PP_Tip 18: Estimate to avoid surprises - All answers are estimates: some are more accurate than others @@ -496,7 +496,7 @@ - The unit of measurement of the estimate conveys a message about accuracy - E.g., 130 working days vs 6 months -## PP_Tip 18: How to make estimates +### PP_Tip 18: How to make estimates 1. Ask people that have done a similar project before 2. Specify what's the scope of an estimate: "under assumptions X and Y, the @@ -507,7 +507,7 @@ properly 5. 
When estimates are wrong, understand why -## PP_Tip 19: Iterate the schedule with the code +### PP_Tip 19: Iterate the schedule with the code - Management often wants a single estimate for the schedule before the project starts @@ -517,14 +517,14 @@ - Productivity - Environment will determine the schedule -## PP_Tip 19: Estimates at the coffee machine +### PP_Tip 19: Estimates at the coffee machine - Estimates given at the coffee machine will come back to haunt you - When asked for an estimate, answer "I'll get back to you" -# The basic tools +## The basic tools -## PP_Tip 20: Keep knowledge in plain text +### PP_Tip 20: Keep knowledge in plain text - The base material for a programmer is knowledge @@ -543,7 +543,7 @@ - All other forms of data; and - The application that created it -## PP_Tip 21: Use the power of command shells +### PP_Tip 21: Use the power of command shells - GUIs - Are great @@ -553,18 +553,18 @@ - Shells allow to automate and combine tools in ways that one didn't intended or planned for -## PP_Tip 22: Use a single editor well +### PP_Tip 22: Use a single editor well - Better to know one editor very well than knowing many editors superficially - Use the same editor for all editing tasks -## PP_Tip 23: Always use source code control +### PP_Tip 23: Always use source code control - Source control is like an undo key, a time machine - Use it always: even if you are a single person team, even if you are working on a throw-away prototype -## PP_Tip 23: Advantages of source code control +### PP_Tip 23: Advantages of source code control - It allows to answer questions like: - Who made changes to this line of code? @@ -573,14 +573,14 @@ branches) - It can connect to automatic repeatable builds and regressions -## PP_Tip 24: Fix the problem, not the blame +### PP_Tip 24: Fix the problem, not the blame - No one writes perfect software so debugging will take up a major portion of your day - Attack debugging as a puzzle to be solved - Avoid denial, finger pointing, lame excuses -## PP_Tip 25: Don't panic +### PP_Tip 25: Don't panic - Before you start debugging, adopt the right mindset: - Turn off defenses that protect your ego @@ -589,7 +589,7 @@ - Don't panic while you debug, even if you have your nervous boss or your client breathing on your neck -## PP_Tip 25: How to debug +### PP_Tip 25: How to debug - Don't waste time thinking "this cannot happen" - Obviously it is happening @@ -607,22 +607,22 @@ - Corrupt variables? - Check their neighborhood variables, use `valgrind` -## PP_Tip 25: Rubber ducking +### PP_Tip 25: Rubber ducking - = explain the issue step-by-step to someone else, even to a yellow rubber duck -## PP_Tip 25: Why rubber ducking works? +### PP_Tip 25: Why rubber ducking works? - The simple act of explaining the issue often uncovers the problem - You state things that you may take for granted - Verbalizing your assumptions lets you gain new insights into the problem -## PP_Tip 26: `select` is not broken +### PP_Tip 26: `select` is not broken - Do not assume that the library is broken - Assume that you are calling the library in the wrong way -## PP_Tip 27: Don't assume it: prove it +### PP_Tip 27: Don't assume it: prove it - Don't assume that a piece of code works in any condition - Avoid statements like "that piece of code has been used for years: it cannot @@ -636,13 +636,13 @@ may be susceptible to the same bug? 
- Make sure that whatever happened, never happens again -## PP_Tip 28: Learn a text manipulation language +### PP_Tip 28: Learn a text manipulation language - Spending 30 mins trying out a crazy idea is better than spending 5 hours - With scripting languages (e.g., Python, perl) you can quickly prototype ideas instead of using a production language (e.g., C, C++) -## PP_Tip 29: Write code that writes code +### PP_Tip 29: Write code that writes code - **_Passive code generators_** - They are run once and the origin of the code is forgotten @@ -654,85 +654,85 @@ needed - This is not duplication, it is the DRY principle in action -# Pragmatic paranoia +## Pragmatic paranoia -## PP_Tip 30: You cannot write perfect software +### PP_Tip 30: You cannot write perfect software - Accept it and celebrate it - Unless you accept it, you'll end up wasting time chasing an impossible dream -## Know when to stop +### Know when to stop - A painter needs to know when to stop adding layers of paint -## Don't trust others +### Don't trust others - Code defensively - Anticipate the unexpected -## Don't trust yourself +### Don't trust yourself - Code defensively against your own mistakes -## Preconditions +### Preconditions - = what must be true in order for the routine to be called - It is caller's responsibility to pass good data -## Postconditions +### Postconditions - = what the routine is guaranteed to do -## Class invariants +### Class invariants - = a class ensures that some conditions are always true - E.g., between invocations to public methods -## Contracts +### Contracts - If all routines' preconditions are met - => the routine guarantees that all postconditions and invariants will be true when it completes -## PP_Tip 31: Design with contracts +### PP_Tip 31: Design with contracts - Some languages support design by contract: - In the compiler (e.g., static assertion, type system) - In the runtime systems (e.g., assertions) -## PP_Tip 32: Crash early +### PP_Tip 32: Crash early - Better to crash than to thrash (=corrupting the state of the system) - When something unexpected happens throw a runtime exception - The exception, if not caught, will percolate up to the top level halting the program -## PP_Tip 33: If it cannot happen, use assertions to ensure that it won't +### PP_Tip 33: If it cannot happen, use assertions to ensure that it won't - Assertions check for things that should never happen - E.g., at the end of a sorting routine the data is not sorted - Don't use assertions in place of real error handling -## Leave assertions enabled +### Leave assertions enabled - Assertions should be left on even after the system is tested and shipped - The assumption that testing will find all the bugs is wrong - Testing tests a miniscule percentage of possible real-world conditions -## PP_Tip 34: Use exceptions for exceptional problems +### PP_Tip 34: Use exceptions for exceptional problems - Interleaving normal control flow code and error handling code leads to ugly code - With exceptions one can split the code neatly into two parts - Exceptions are like `goto` -## Exceptions are for unexpected events +### Exceptions are for unexpected events - Use exceptions only for truly unexpected events - The code should still run in normal conditions if one removes all the exception handlers -## PP_Tip 35: Finish what you start +### PP_Tip 35: Finish what you start - Resources (e.g., memory, DB transactions, threads, files, timers) follow a pattern: @@ -744,9 +744,9 @@ - To avoid deadlocks always deallocate resources 
in the opposite order to that in which you allocate them -# Bend or break +## Bend or break -## PP_Tip 36: Minimize coupling between modules +### PP_Tip 36: Minimize coupling between modules - We want to limit the interaction between modules - If one modules has to be replaced / is broken, the other modules can carry @@ -754,7 +754,7 @@ - Traversing relationships between objects can quickly lead to a combinatorial explosion of dependencies -## PP_Tip 36: Law of Demeter for functions +### PP_Tip 36: Law of Demeter for functions - Any method `O.m(A, B, C)` of an object `O` should call only methods belonging to: @@ -765,7 +765,7 @@ - A rule of thumb in OOP is to use a single dot, e.g., `a.m()` and avoid multiple dots, e.g., `a.b.m()` -## PP_Tip 36: Intuition of the Law of Demeter +### PP_Tip 36: Intuition of the Law of Demeter - Object `A` can call a method of `B` - `A` cannot reach through `B` to access an object `C` @@ -773,7 +773,7 @@ - `B` needs to be changed to expose the interface of `C` - Cons: lots of wrapper methods to forward requests to delegates -## PP_Tip 36: Law of Demeter as general contractor +### PP_Tip 36: Law of Demeter as general contractor - It's like using a general contractor @@ -782,7 +782,7 @@ - Cons - The client needs to go through the general contractor all the times -## PP_Tip 37: Configure, don't integrate +### PP_Tip 37: Configure, don't integrate - Every time we change the code to accommodate a change in business logic we risk to break the system or to introduce a new bug @@ -792,7 +792,7 @@ database, middleware, user-interface style, ...) - Use `.ini` files -## PP_Tip 38: Put abstractions in code, details in metadata +### PP_Tip 38: Put abstractions in code, details in metadata - The goal is to think declaratively: specify what to do, not how to do it - We want to configure and drive the application via metadata as much as @@ -800,12 +800,12 @@ - So we program for the general case and put the specifics outside the compiled code -## Mechanisms vs policies +### Mechanisms vs policies - Mechanisms = primitives, what can be done - Policies = how to put together primitives -## PP_Tip 38: Advantages of splitting mechanisms and policies +### PP_Tip 38: Advantages of splitting mechanisms and policies 1. Decouple components in the design, resulting in more flexible and adaptable programs @@ -813,19 +813,19 @@ 3. Metadata can be expressed in a form closer to problem domain 4. 
Anybody can change the behavior without understanding the code -## PP_Tip 38: Put business logic in metadata +### PP_Tip 38: Put business logic in metadata - Business logic and rules are the parts that are most likely to change - So we want to maintain them in a flexible format, e.g., metadata - Metadata should be encoded in plain text -## PP_Tip 39: Analyze workflow to improve concurrency +### PP_Tip 39: Analyze workflow to improve concurrency - Avoid temporal coupling - Use activity diagrams to identify activities that could be performed in parallel -## PP_Tip 40: Temporal coupling +### PP_Tip 40: Temporal coupling - Time is often ignored when designing a software architecture - We tend to think in a linear sequential fashion: "do this, then do that" @@ -835,7 +835,7 @@ - Concurrency = things happening at the same time - Ordering = `A` must occur before `B` -## PP_Tip 40: UML activity diagram +### PP_Tip 40: UML activity diagram - Actions are represented by rounded boxes - Arrows between actions mean "temporal ordering" @@ -844,15 +844,15 @@ - Once all the actions leading to a barrier are done, one can proceed with the arrows leaving the synchronization point -## Service +### Service - = independent, concurrent objects behind well-defined, consistent interfaces -## PP_Tip 40: Design using services +### PP_Tip 40: Design using services - Using services allows to avoid temporal coupling -## PP_Tip 41: Hungry consumer model +### PP_Tip 41: Hungry consumer model - There are multiple independent consumers and a centralized work queue - Each consumer grabs a task from the work queue and processes it @@ -884,12 +884,12 @@ - Actions are asynchronous: as soon as a request is handled by a process and put on next queue, the process goes back to monitor inputs -## PP_Tip 41: Always design for concurrency +### PP_Tip 41: Always design for concurrency - Programming with threads imposes some design constraints - Concurrency forces you to think things more carefully -## Examples of problems with concurrency +### Examples of problems with concurrency - Global or static variables must be protected from concurrent access - Do you really need a global variable? @@ -899,11 +899,11 @@ - Interfaces should not keep state - Make services stateless -## Event +### Event - = special message that says "something interesting just happened" -## PP_Tip 42: Objects communicating through events +### PP_Tip 42: Objects communicating through events - We know that we need to separate a program in modules / classes - A module / class has a single, well-defined responsibility @@ -914,7 +914,7 @@ - By using events one can minimize coupling between objects: objects are interested in events and not in other objects -## PP_Tip 42: All events through a single routine approach +### PP_Tip 42: All events through a single routine approach - One approach is to send all events to a single routine that dispatches them to the objects @@ -923,7 +923,7 @@ - It's like a huge case statement - It violates encapsulation, increases coupling -## PP_Tip 42: Publish / subscribe model +### PP_Tip 42: Publish / subscribe model - Objects should only receive events they want (do no spam objects!) 
- Subscribers register themselves with publisher objects for interesting @@ -934,14 +934,14 @@ - Software bus (a centralized object maintains DB of listeners and dispatches messages) -## PP_Tip 42: Sequence diagram +### PP_Tip 42: Sequence diagram - It shows the flow of messages among several objects - Objects are arranged in columns - Each message is an arrow from sender's column to receiver's column - Timing relationship between messages is captured by vertical ordering -## PP_Tip 42: Push / pull model for event services +### PP_Tip 42: Push / pull model for event services - **_Push mode_** - Event producers inform the event channel that event has occurred @@ -952,7 +952,7 @@ - Event consumers poll the event channel periodically - Event channel polls possible suppliers and report interesting events -## PP_Tip 42: Separate views from models +### PP_Tip 42: Separate views from models - Model-View-Controller design pattern @@ -986,13 +986,13 @@ - A View looks for new world records - A View posts information on the web -## PP_Tip 43: Use blackboards to coordinate workflow +### PP_Tip 43: Use blackboards to coordinate workflow - A blackboard system lets decouple objects from each other completely - There is even less coupling that publish / subscribe model - Consumers and producers exchange data anonymously and asynchronously -## PP_Tip 43: Example of blackboard implementation +### PP_Tip 43: Example of blackboard implementation - Blackboard is like a DB providing atomic and distributed storage of objects @@ -1007,7 +1007,7 @@ - Advantage is a single and consistent interface to blackboard, instead of different APIs for every transaction / interaction in the system -## PP_Tip 43: Example of blackboard application +### PP_Tip 43: Example of blackboard application - Program that accepts and process loan applications @@ -1025,14 +1025,14 @@ - Any time new data arrives, a new rule is triggered - Rules can output more data on the blackboard, triggering more rules and so on -# While you are coding +## While you are coding -## PP_Tip 44: Don't program by coincidence +### PP_Tip 44: Don't program by coincidence - Do not program by coincidence (= relying on luck and accidental successes) - Program deliberately -## PP_Tip 44: How to program by coincidence +### PP_Tip 44: How to program by coincidence - You type some code, try it, and it seems to work - Type more code and still works @@ -1042,14 +1042,14 @@ - The code seemed to work, given the (limited) testing did, but it was just a coincidence -## PP_Tip 44: Why program by coincidence seems to work? +### PP_Tip 44: Why program by coincidence seems to work? - One ends up relying on undocumented boundary conditions and when the code is fixed / changed, our code breaks - One tries until something works, then it does not wonder why it works: "it works now, better leave it alone" -## PP_Tip 44: How to program deliberately +### PP_Tip 44: How to program deliberately - Rely only on reliable things - Document your assumptions (e.g., design by contract) @@ -1058,13 +1058,13 @@ unit tests) - Don't let existing code (even your own code) dictate future code -## PP_Tip 45: Estimate the order of your algorithms +### PP_Tip 45: Estimate the order of your algorithms - We always want to estimate the resources (e.g., time, memory) required by algorithms - E.g., "Would the algorithm scale up from 1k records to 1M records?" 
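- As a concrete illustration of the tip above, a quick way to estimate scaling
  empirically is to time the routine at increasing input sizes and look at how
  the runtime grows
- This is a minimal sketch and not code from the repo: `sort_records()` is a
  hypothetical stand-in for the algorithm under study
  ```python
  import random
  import timeit
  from typing import List


  def sort_records(records: List[int]) -> List[int]:
      # Hypothetical routine under study: replace with the real algorithm.
      return sorted(records)


  def estimate_scaling() -> None:
      """
      Time the routine at increasing input sizes and print the growth ratio.
      """
      prev_time = None
      for n in [1_000, 10_000, 100_000, 1_000_000]:
          data = [random.randint(0, 10**9) for _ in range(n)]
          # Average over a few runs to reduce noise.
          elapsed = timeit.timeit(lambda: sort_records(list(data)), number=3) / 3
          ratio = elapsed / prev_time if prev_time else float("nan")
          print(f"n={n:>9,d} time={elapsed:.4f}s ratio_vs_prev={ratio:.2f}")
          prev_time = elapsed


  if __name__ == "__main__":
      estimate_scaling()
  ```
- If the runtime grows roughly 10x when the input grows 10x the routine is
  linear, ~100x suggests quadratic, and slightly more than 10x is consistent
  with $O(n \log(n))$ (see the Big-Oh notation below)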
-## Big-Oh notation +### Big-Oh notation - Big-Oh notation represents the worst-case time taken by an algorithm - Simple loops: $O(n)$ @@ -1073,48 +1073,48 @@ - Divide and conquer (split, recurse, combine): $O(n \log(n))$ - Combinatoric: $O(2^n)$ -## Estimating Big-Oh +### Estimating Big-Oh - If you are not sure about Big Oh, vary input record size and plot the resource needed (e.g., time, memory) against the input size -## Big-Oh in the real world +### Big-Oh in the real world - It is possible that a $O(n^2)$ algorithm is faster than a $O(n \log(n))$ for small inputs - Even if runtime looks linear, the machine might start trashing for lack of memory and thus not scale linearly in the real world -## PP_Tip 46: Test your estimate +### PP_Tip 46: Test your estimate - It's tricky to get accurate execution times - Use code profilers to count the number of times different steps of your algorithm get executed and plot against the input size -## Be wary of premature optimization +### Be wary of premature optimization - Make sure an algorithm is really the bottleneck before investing precious time trying to improve it -## Refactoring +### Refactoring - = re-writing, re-working, re-architecting code -## PP_Tip 47: Refactor early, refactor often +### PP_Tip 47: Refactor early, refactor often - If you cannot refactor immediately - Make space in the schedule - File a bug - Limit the spread of the virus -## How to refactor +### How to refactor - Refactoring needs to be undertaken slowly, deliberately, and carefully - Don't refactor and add functionality at the same time - Make sure you have good tests before refactoring - Take baby steps to avoid prolonged debugging -## PP_Tip 47: Software development is more gardening than building +### PP_Tip 47: Software development is more gardening than building - Unfortunately the most common metaphor for software development is building construction: blueprints, build, release, maintenance, ... @@ -1129,7 +1129,7 @@ - Monitor health of the plants - Make adjustments -## PP_Tip 47: When to refactor? +### PP_Tip 47: When to refactor? - It is time to refactor when you notice: - Duplication @@ -1137,7 +1137,7 @@ - Outdated knowledge - Bad performance -## PP_Tip 47: Management and refactoring +### PP_Tip 47: Management and refactoring - How do you explain to your boss that "the code works, but it needs to be refactored?" @@ -1146,7 +1146,7 @@ - It's like accumulating debt: at some point it will need to be repaid, with interests! -## PP_Tip 48: Design to test +### PP_Tip 48: Design to test - Chips are designed to be tested - At the factory @@ -1161,7 +1161,7 @@ - Test each piece thoroughly (unit testing) before wiring them together (integration testing) -## PP_Tip 48: Testing against contract +### PP_Tip 48: Testing against contract - Write test cases that ensure that a unit honors its contract - This also checks whether the contract means what we think it means @@ -1169,12 +1169,12 @@ - We need to check over a wide range of test cases and boundary conditions - There's no better way to fix errors than by avoiding them in the first place -## Test-driven development +### Test-driven development - By building the tests before you implement the code, you try out the interface before you commit to it -## PP_Tip 48: Where to put unit tests? +### PP_Tip 48: Where to put unit tests? 
- Unit tests should be somehow close to the code they test - E.g., in a parallel directory @@ -1184,7 +1184,7 @@ - Examples of how to use a module - A means to validate any future changes to code -## PP_Tip 48: Provide a test harness +### PP_Tip 48: Provide a test harness - It's good to have a way to: - Select tests to run @@ -1198,7 +1198,7 @@ - There are standard test harness (e.g., `unittest` for Python, `cppunit` for C++) -## PP_Tip 48: Build a test backdoor +### PP_Tip 48: Build a test backdoor - No piece of software is perfect and bugs show up in real world - Have log files containing trace messages @@ -1207,13 +1207,13 @@ - Pointing to a certain port of the machine one can see internal status, log entries, a debug control panel -## PP_Tip 49: Test your software or your users will +### PP_Tip 49: Test your software or your users will - All software you write will be tested - If not by you, then by the eventual users - It is better to test it thoroughly than being swamped in help desk calls -## PP_Tip 50: Don't use wizard code you don't understand +### PP_Tip 50: Don't use wizard code you don't understand - Applications are getting harder and harder to write - User interfaces are becoming increasingly sophisticated @@ -1226,13 +1226,13 @@ - What we don't understand is behind a tidy interface - Wizard code is interwoven with our application -# Before the project +## Before the project -## Requirement +### Requirement - = a statement about something that needs to be accomplished -## PP_Tip 51: Don't gather requirements: dig for them +### PP_Tip 51: Don't gather requirements: dig for them - Gathering requirements implies that the requirements are already there - In reality they are buried deep beneath layers of @@ -1240,7 +1240,7 @@ - Misconceptions - Politics -## PP_Tip 51: Example of tricky requirements +### PP_Tip 51: Example of tricky requirements - A requirements can sound like: "Only an employee's supervisors and HR may view an employee's records" @@ -1252,7 +1252,7 @@ - Give the policy as an example of what should be supported - Policies should eventually go in the metadata of the application -## PP_Tip 52: Work with a user to think like a user +### PP_Tip 52: Work with a user to think like a user - We need to discover the underlying reason why users do a thing, rather than just the way they currently do it @@ -1262,24 +1262,24 @@ used - Build trust and establish communication with your users -## Use cases +### Use cases - = capture requirements through a particular use of the system -## PP_Tip 53: Use cases +### PP_Tip 53: Use cases - While sitting with the user, you see a few interesting scenarios that describe what the application needs to do - Write the scenarios in a document that everyone (developers, end users, project sponsor, management) can discuss -## PP_Tip 53: Abstractions live longer than details +### PP_Tip 53: Abstractions live longer than details - Good requirement docs should remain abstract - They reflect the business needs, not architecture, not design, not user interface, ... 
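- As a sketch of how the earlier requirement example ("only an employee's
  supervisors and HR may view an employee's records") can stay abstract in
  code: the function below only knows that access is controlled by role, while
  the concrete roles live in metadata and can change without touching the code
- This is illustrative only: the resource and role names are made up
  ```python
  import json
  from typing import Dict, List

  # In practice the policy would be loaded from a config file shipped as
  # metadata, e.g., an `access_policy.json`.
  _POLICY_JSON = """
  {
    "employee_record": ["supervisor", "hr"]
  }
  """


  def can_view(
      resource: str, user_roles: List[str], policy: Dict[str, List[str]]
  ) -> bool:
      """
      Return True if any of the user's roles may view `resource`.
      """
      allowed_roles = policy.get(resource, [])
      return any(role in allowed_roles for role in user_roles)


  policy = json.loads(_POLICY_JSON)
  print(can_view("employee_record", ["supervisor"], policy))  # True
  print(can_view("employee_record", ["engineer"], policy))    # False
  ```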
-## PP_Tip 53: Feature-itis +### PP_Tip 53: Feature-itis - Aka feature bloat - The issue is that by adding "just one more feature", the scope of the project @@ -1287,19 +1287,19 @@ - One should track the number of bugs reported and fixed, the number of additional features requested and who approved them -## Project glossary +### Project glossary - = one place that defines all the specific terms and vocabulary used in a project -## PP_Tip 54: Use a project glossary +### PP_Tip 54: Use a project glossary - It's hard to succeed if users and developers - Refer to the same thing with different names, or - Refer to different things with the same name - Create and maintain a project glossary -## PP_Tip 55: Don't think outside the box: find the box +### PP_Tip 55: Don't think outside the box: find the box - The secret to solve a puzzle is to identify the real (not imagined) constraints and find a solution therein @@ -1315,7 +1315,7 @@ - Don't dismiss anything, then explain why a certain path cannot be taken. Can you prove it? -## PP_Tip 55: Impossible problems +### PP_Tip 55: Impossible problems - You are stuck on a problem that seems "impossible" - You are late on the schedule @@ -1325,7 +1325,7 @@ - Does it have to be done this way? - Does it have to be done _at all_? -## PP_Tip 56: Listen to nagging doubts: start when you are ready +### PP_Tip 56: Listen to nagging doubts: start when you are ready - When you experience some reluctance when faced with a task, take notice - Sometimes your instinct is right on the spot, although you cannot put a finger @@ -1337,7 +1337,7 @@ - = process of reducing requirements to the point where coding can start - The goal is to remove major ambiguities -## PP_Tip 57: Some things are better done than described +### PP_Tip 57: Some things are better done than described - Program specification is an agreement with the user - It's important to stop increasing level of detail and start coding, @@ -1346,7 +1346,7 @@ - Once the system is running, users will ask for changes - Natural language is not expressive enough to clarify everything -## PP_Tip 58: Don't be a slave to formal methods +### PP_Tip 58: Don't be a slave to formal methods - Many methods have been developed to make programming more like engineering (e.g., waterfall development, UML, ...) 
@@ -1354,14 +1354,14 @@ - The user typically does not understand them and cannot provide feedback - It's better to give the users a prototype and let them play with it -## PP_Tip 59: Expensive tools do not produce better designs +### PP_Tip 59: Expensive tools do not produce better designs - Never give in into a methodology just because it is the hot new fad - Do not think about how much a tool costs when you look at its output -# Pragmatic projects +## Pragmatic projects -## PP_Tip 60: Pragmatic teams +### PP_Tip 60: Pragmatic teams - Most of the pragmatic programming principles apply to teams, as much as they apply to an individual @@ -1395,7 +1395,7 @@ - Different activities of a project (analysis, design, coding, testing) can't happen in isolation -## PP_Tip 60: Organize around functionality, not job functions +### PP_Tip 60: Organize around functionality, not job functions - Organize people in the same way one organizes code - Design by contract @@ -1412,7 +1412,7 @@ - Of course this approach works only with responsible developers and strong project management -## PP_Tip 61: Don't use manual procedures +### PP_Tip 61: Don't use manual procedures - We want to ensure consistency and repeatability in the project - Manual procedures leave consistency up to chance @@ -1422,29 +1422,29 @@ - We want to check out, build, test, ship with a single command - `make` and `cronjobs` are the solutions -## PP_Tip 62: Test early +### PP_Tip 62: Test early - Look for your bugs now, so you don't have to endure the shame of others finding your bugs later - Start testing as soon as you have code - Code a little, test a little -## PP_Tip 62: Test often +### PP_Tip 62: Test often - The earlier a bug is found, the cheaper it is to remedy -## PP_Tip 62: Test automatically +### PP_Tip 62: Test automatically - Tests that run with every build are better than test plans that sit on a shelf - A good project may well have more test code than production code -## PP Top 63: Coding ain't done 'til all the tests run +### PP Top 63: Coding ain't done 'til all the tests run - Just because you finished coding, you cannot tell that the code is done - You cannot claim that the code is done until it passes all the available tests - Code is never really done -## PP_Tip 63: What to test +### PP_Tip 63: What to test - There are multiple types of tests: - Unit @@ -1453,23 +1453,23 @@ - Performance - Usability tests -## Unit test +### Unit test - = exercise a module - If the parts don't work by themselves, they won't work together -## Integration test +### Integration test - = show that the major subsystems work well together - Integration is the largest source of bugs in the system - Test that the entire system honors its contract -## Validation and verification test +### Validation and verification test - = the users told you what they wanted, but is it what they really need? - A bug-free system that answers the wrong question is not useful -## Resource exhaustion, errors, and recovery test +### Resource exhaustion, errors, and recovery test - Resources are limited in the real world, e.g.: - Memory @@ -1479,21 +1479,21 @@ - Video resolution - How will the system behave under real-world conditions? -## Performance test +### Performance test - = testing under a given load (e.g., expected number of users, connections, transactions per second) - Does the system scale? 
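- A minimal sketch of such a performance test: exercise the routine under the
  expected load and assert that it stays within a throughput budget
- `process_transaction()` and the load / budget numbers below are hypothetical
  placeholders, not code from the repo
  ```python
  import time


  def process_transaction(payload: dict) -> dict:
      # Hypothetical unit of work whose throughput we care about.
      return {"status": "ok", **payload}


  def test_throughput_under_expected_load() -> None:
      """
      Check that the expected load is handled within the throughput budget.
      """
      num_transactions = 10_000
      start = time.perf_counter()
      for i in range(num_transactions):
          result = process_transaction({"id": i})
          assert result["status"] == "ok"
      elapsed = time.perf_counter() - start
      tps = num_transactions / elapsed
      # Budget chosen for illustration: tune it to the real requirement.
      assert tps > 1_000, f"Throughput too low: {tps:.0f} transactions/sec"


  if __name__ == "__main__":
      test_throughput_under_expected_load()
      print("Performance test passed")
  ```
- Running this as part of the automated test suite (e.g., it is collected by
  `pytest` like any other `test_*` function) turns "does the system scale?"
  into a repeatable regression check instead of a one-off measurement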
-## Usability test +### Usability test - = performed with real users, under real environmental conditions -## PP_Tip 63: How to test +### PP_Tip 63: How to test - Run regression tests for all types of tests -## Regression testing +### Regression testing - = compare output of current test with previous known values - It ensures that fixes for today's bugs don't break things that were working @@ -1505,7 +1505,7 @@ - Performance - Usability tests -## Test data +### Test data - Test data is either real-world data or synthetic data - One needs to use both, since they expose different kind of bugs @@ -1514,31 +1514,31 @@ - Can have certain statistical properties (e.g., data to sort is already sorted or inversely sorted) -## Exercising GUI systems +### Exercising GUI systems - Often specialized testing tools are required, e.g., - Event capture / playback model - A data processing application with GUI front end should be decoupled so that one can test each component by itself -## Testing the tests +### Testing the tests - We cannot write perfect software - We need to test the tests and the test infrastructure -## Testing thoroughly +### Testing thoroughly - Use coverage analysis tools to keep track of which lines of the code have been / not been executed -## PP_Tip 64: Use saboteurs to test your testing +### PP_Tip 64: Use saboteurs to test your testing - If the system is a security system, test the system by trying to break in - After you have written a test to detect a particular bug, cause the bug deliberately and make sure the tests catch it - Write test for both positive and negative cases -## PP_Tip 65: Test state coverage, not code coverage +### PP_Tip 65: Test state coverage, not code coverage - Knowing that you executed all code does not tell you if you tested all states - This is a combinatorial problem @@ -1546,7 +1546,7 @@ - Boundary conditions - Structure of the code -## PP_Tip 65: When to test +### PP_Tip 65: When to test - As soon as any code exists, it must be tested - Testing should be done automatically as often as we can (e.g., before code is @@ -1555,14 +1555,14 @@ `not_ok` - Expensive / special tests can be run less frequently, but on a regular basis -## PP_Tip 66: Find bugs once +### PP_Tip 66: Find bugs once - Once a human tester finds a bug, a new test should be created to check for that bug every time - You don't want to keep chasing the same bugs that the automated tests could find for you -## PP_Tip 67: Treat English as just another programming language +### PP_Tip 67: Treat English as just another programming language - Embrace documentation as an integral part of software development - Keep the documentation in the code itself as much as possible @@ -1570,23 +1570,23 @@ - Apply all the principles learned for coding (DRY principle, orthogonality, ...) 
to English as well -## PP_Tip 68: Internal documentation +### PP_Tip 68: Internal documentation - Source code comments - Design documents - Test documents -## External documentation +### External documentation - = anything that is shipped or published to the outside world together with the software product (e.g., user manuals) -## Documentation vs code +### Documentation vs code - Documentation and code are different views of the same underlying model - If there is a discrepancy, the code is what matters -## PP_Tip 68: Comments +### PP_Tip 68: Comments - Code should have comments, but too many comments are as bad as too few - Comments should discuss \textit{why} something is done (e.g, engineering @@ -1596,14 +1596,14 @@ - Javadoc notation is useful (`\@param`, `\@return`, ...) to extract information from the code automatically -## Naming concepts +### Naming concepts - Variable names should be meaningful - Remember that you will be writing the code once, but reading it hundreds of time: avoid write-only code - Misleading names are worse than meaningless names -## PP_Tip 68: Automatically generated documentation +### PP_Tip 68: Automatically generated documentation - Also for documentation we want to use pragmatic principles - DRY principle @@ -1616,12 +1616,12 @@ - There should be a single command to generate and publish the documents on-line - Use a timestamp or review number for each page -## PP_Tip 69: Gently exceed your users' expectations +### PP_Tip 69: Gently exceed your users' expectations - Success of a project is measured by how well it meets the expectations of its users -## Examples of difference between actual and expected results +### Examples of difference between actual and expected results - A company announces record profits, and its share price drops 20\% - It didn't meet analysts' expectations @@ -1630,7 +1630,7 @@ - A team works miracles to implement a complex application - The users don't like it because it does not have an help system -## PP_Tip 69: Communicating expectations +### PP_Tip 69: Communicating expectations - Users come to you with some vision of what they want - It may be @@ -1639,24 +1639,24 @@ - Impossible - They are invested in it: you cannot ignore this -## Manage expectations +### Manage expectations - Work with your users so that they understand what you are delivering - Never lose sight of the business problems your application is intended to solve -## Go the extra mile +### Go the extra mile - Surprise and delight your users - E.g., balloon help, colorization, automatic installation, splash screen customized for their organization, ... 
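- Picking up the earlier point about automatically generated documentation: a
  minimal sketch of extracting reference docs straight from the docstrings, so
  that the code stays the single source of truth (DRY)
- The module and function names are hypothetical and the output format is just
  an example
  ```python
  import inspect
  import types


  def generate_markdown_docs(module: types.ModuleType) -> str:
      """
      Render the public functions of `module` as a Markdown reference page.

      :param module: module to document
      :return: Markdown text with one section per public function
      """
      lines = [f"# `{module.__name__}`", ""]
      for name, obj in inspect.getmembers(module, inspect.isfunction):
          if name.startswith("_"):
              # Private helpers are not part of the interface.
              continue
          signature = inspect.signature(obj)
          docstring = inspect.getdoc(obj) or "(no docstring)"
          lines += [f"## `{name}{signature}`", "", docstring, ""]
      return "\n".join(lines)


  if __name__ == "__main__":
      import sys

      # E.g., document this very module; a wrapper script could publish the
      # result with a single command.
      print(generate_markdown_docs(sys.modules[__name__]))
  ```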
-## PP_Tip 70: Sign your work +### PP_Tip 70: Sign your work - Craftsmen of earlier ages were proud to sign their work - Your signature should come to be recognized as an indicator of quality -## PP_Tip 70: Code ownership vs anonymity +### PP_Tip 70: Code ownership vs anonymity - Code ownership can cause cooperation problems: people become territorial - Anonymity can enable sloppiness, laziness diff --git a/docs/coding/all.code_review.how_to_guide.md b/docs/coding/all.code_review.how_to_guide.md index f7846345ed..ed93506917 100644 --- a/docs/coding/all.code_review.how_to_guide.md +++ b/docs/coding/all.code_review.how_to_guide.md @@ -1,4 +1,7 @@ -# Code review +# Code Review + +## Code review + - [General rules about code review](#general-rules-about-code-review) @@ -34,9 +37,9 @@ -# General rules about code review +## General rules about code review -## Read the Google code review best practices +### Read the Google code review best practices - From the [developer\'s perspective](https://google.github.io/eng-practices/review/developer) @@ -52,9 +55,9 @@ - Understand the rationale -# Code review workflows +## Code review workflows -## Pull request +### Pull request - Our usual review process is to work in a branch and create a pull request - See the @@ -63,16 +66,16 @@ - The name of the pull request is generated with ghi_show.py and looks like PTask2704 make exchange contracts get contracts applicable to series -# From the code author point of view +## From the code author point of view -## Why we review code +### Why we review code - We spend time reviewing each other code so that we can: - Build a better product, by letting other people look for bugs - Propagate knowledge of the code base through the team - Learn from each other -## PR checklist +### PR checklist - From [Google reviewer checklist](https://google.github.io/eng-practices/review/reviewer/looking-for.html): @@ -90,7 +93,7 @@ - Code is appropriately documented. - The code conforms to our style guides. -## The golden rule of code review +### The golden rule of code review - Make life easy for the reviewers - Aka "Do not upset the reviewers, otherwise they won't let you merge your @@ -102,7 +105,7 @@ - If you are in doubt "it's probably clear, although I am not 100% sure", err on giving more information and answer potential questions -## Be clear in the PR request about what you want +### Be clear in the PR request about what you want - Summarize what was done in the PR - Refer to the GH task, but the task alone might not be sufficient @@ -110,7 +113,8 @@ - Which part is it implementing? - Why is it doing it in a certain way? -- If the code is not ready for merge, but you want a "pre-review" convert PR to a draft +- If the code is not ready for merge, but you want a "pre-review" convert PR to + a draft - E.g., ask for an architectural review - Draft PRs can not be merged @@ -120,7 +124,7 @@ carefully - If it\'s blocking a ping on IM is a good idea -## Do not mix changes and refactoring / shuffling code +### Do not mix changes and refactoring / shuffling code - The job of the reviewers become frustrating when the author mixes: - Refactoring / moving code; and @@ -151,14 +155,14 @@ - Merge `TaskXYZ_do_this_and_that` to master - Merge `master` back into `gp_scratch` and keep moving -## Double check before sending a PR +### Double check before sending a PR - After creating a PR take a look at it to make sure things look good, e.g., - Are there merge problems? - Did you forget some file? 
- Skim through the PR to make sure that people can understand what you changed -## Reviewing other people's code is usually not fun +### Reviewing other people's code is usually not fun - Reviewing code is time-consuming and tedious - So do everything you can to make the reviewer's job easier @@ -170,7 +174,7 @@ - Readability is paramount - You should abhor write-only code -## The first reviews are painful +### The first reviews are painful - One needs to work on the same code over and over - Just think about the fact that the reviewer is also reading (still crappy) @@ -179,13 +183,14 @@ - Unfortunately it is needed pain to get to the quality of code we need to make progress as a team -## Apply review comments everywhere +### Apply review comments everywhere - Apply a review comment everywhere, not just where the reviewer pointed out the issue - E.g., reviewer says: - - "Please replace `_LOG.warning("Hello %s".format(name))` with `_LOG.warning("Hello %s", name)`" + - "Please replace `_LOG.warning("Hello %s".format(name))` with + `_LOG.warning("Hello %s", name)`" - You are expected to do this replacement: - In the current review - In all future code you write @@ -193,11 +198,11 @@ - Of course don't start modifying the old code in this review, but open a clean-up bug, if you need a reminder -## Look at the code top-to-bottom +### Look at the code top-to-bottom - E.g., if you do a search & replace, make sure everything is fine -## Answering comments after a review +### Answering comments after a review - It's better to answer comments in chunks so we don't get an email per comment - Use "start a review" (not in conversation) @@ -205,7 +210,7 @@ send it as single comment - When you answer a comment, mark it as resolved -## Apply changes to a review quickly +### Apply changes to a review quickly - In the same way the reviewers are expected to review PRs within 24 hours, the author of a PR is expected to apply the requested changes quickly, ideally in @@ -223,37 +228,39 @@ fix the problems and then open a PR with new code - Other people that rely on your code are blocked -## Ask for another review +### Ask for another review - Once you are done with resolving all the comments ask for another review -## Workflow of a review in terms of GH labels +### Workflow of a review in terms of GH labels - The current meaning of the labels are: - - See GitHub ZenHub workflows [doc](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md#pr-labels) + - See GitHub ZenHub workflows + [doc](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md#pr-labels) -## Link PR to GH issue +### Link PR to GH issue - Mention the corresponding issue in the PR description to ease the navigation -E.g., see an [example](https://github.com/kaizen-ai/kaizenflow/pull/288#issue-1729654983) + E.g., see an + [example](https://github.com/kaizen-ai/kaizenflow/pull/288#issue-1729654983) -## Fix later +### Fix later - It's ok for an author to file a follow up Issue (e.g., with a clean up), by pointing the new Issue to the comments to address, and move on with merge - The Issue needs to be addressed immediately after -# From the code reviewer point of view +## From the code reviewer point of view -## Post-commit review +### Post-commit review - You can comment on a PR already merged - You can comment on the relevant lines in a commit straight to `master` (this is the exception) -## Code walk-through +### Code walk-through 
- It is best to create a branch with the files you want to review - Add TODOs in the code (so that the PR will pick up those sections) @@ -262,21 +269,21 @@ E.g., see an [example](https://github.com/kaizen-ai/kaizenflow/pull/288#issue-17 - Try to get a top to bottom review of a component once every N weeks (N = 2, 3) - Sometimes the structure of the -## Close the PR and delete the branch +### Close the PR and delete the branch - When code is merged into master by one of the reviewers through the UI one can select the delete branch option -- Otherwise you can delete the branch using the procedure in -[Git](https://docs.google.com/document/u/0/d/1zahC8uDnFGYRSgkBrQRgg3W3ZmDjZZJj6yln6YeuHq4/edit) +- Otherwise you can delete the branch using the procedure in + [Git](https://docs.google.com/document/u/0/d/1zahC8uDnFGYRSgkBrQRgg3W3ZmDjZZJj6yln6YeuHq4/edit) -## Give priority to code review +### Give priority to code review - We target to give feedback on a PR within 24hr so that the author is not blocked for too long - Usually we respond in few hours -## Multiple reviewers problem +### Multiple reviewers problem - When there are multiple reviewers for the same PR there can be some problem @@ -287,11 +294,11 @@ E.g., see an [example](https://github.com/kaizen-ai/kaizenflow/pull/288#issue-17 - The other can catch up with post-commit review - A good approach is to monitor recently merged PRs in GH to catch up -## Remember "small steps ahead" +### Remember "small steps ahead" - Follow the Google approach of merging a PR that is a strict improvement. -## Nothing is too small +### Nothing is too small - Each reviewer reviews the code pointing out everything that can be a problem @@ -302,7 +309,7 @@ E.g., see an [example](https://github.com/kaizen-ai/kaizenflow/pull/288#issue-17 their own stylistic preference, this should not be pointed, unless it's a matter of consistency or leave the choice to the author -## Final GH comment +### Final GH comment - Once you are done with the detailed review of the code, you need to - Write a short comment diff --git a/docs/coding/all.coding_style.how_to_guide.md b/docs/coding/all.coding_style.how_to_guide.md index eb0281c115..48bf32860f 100644 --- a/docs/coding/all.coding_style.how_to_guide.md +++ b/docs/coding/all.coding_style.how_to_guide.md @@ -1,4 +1,6 @@ -# KaizenFlow - Python Style Guide +# Coding Style + +## KaizenFlow - Python Style Guide @@ -139,7 +141,7 @@ -# Meta +## Meta - What we call "rules" are actually just a convention - The "rules" @@ -159,7 +161,7 @@ - Rules are not fixed in stone - Rules evolve based on what we discuss through the reviews -# Disclaimer +## Disclaimer - This document was forked from [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html), @@ -173,12 +175,12 @@ (software developer, DevOps or data scientist) should abide. Read it on long commutes, during lunch, and treat yourself to a physical copy on Christmas. The book is summarized - [here](/docs/coding/all.code_like_pragmatic_programmer.how_to_guide.md), + [here](https://github.com/cryptokaizen/cmamp/blob/master/docs/coding/all.code_like_pragmatic_programmer.how_to_guide.md), but do not deprive yourself of the engaging manner in which Thomas & Hunt elaborate on these points -- on top of it all, it is a very, very enjoyable read. 
-## References +### References - Coding - [Google Python Style Guide (GPSG)](https://google.github.io/styleguide/pyguide.html) @@ -193,7 +195,7 @@ - [Google philosophical stuff](https://github.com/google/styleguide/blob/gh-pages/docguide/philosophy.md) - [Unix rules (although a bit cryptic sometimes)](https://en.wikipedia.org/wiki/Unix_philosophy#Eric_Raymond%E2%80%99s_17_Unix_Rules) -# High-Level Principles +## High-Level Principles - In this paragraph we summarize the high-level principles that we follow for designing and implementing code and research. We should be careful in adding @@ -201,9 +203,9 @@ all the other lower level principles we follow (like a basis for a vector space) -### Follow the [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principle +#### Follow the [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principle -### The writer is the reader +#### The writer is the reader - Make code easy to read even if it is more difficult to write - Code is written 1x and read 100x @@ -213,20 +215,20 @@ - So make your future-self's life easier by following the conventions and erring on the side of documenting for the reader. -### Encapsulate what changes +#### Encapsulate what changes - Separate what changes from what stays the same -### [Least surprise principle](https://en.wikipedia.org/wiki/Principle_of_least_astonishment) +#### [Least surprise principle](https://en.wikipedia.org/wiki/Principle_of_least_astonishment) - Try to make sure that the reader is not surprised -### Pay the technical debt +#### Pay the technical debt - Any unpaid debt is guaranteed to bite you when you don't expect it - Still some debt is inevitable: try to find the right trade-off -### End-to-end first +#### End-to-end first - Always focus on implementing things end-to-end, then improve each block - Remember the analogy of building the car through the skateboard, the bike, @@ -234,18 +236,18 @@ - Compare this approach to building wheels, chassis, with a big-bang integration at the end -### Unit test everything +#### Unit test everything - Code that matters needs to be unit tested - Code that doesn't matter should not be checked in the repo - The logical implication is: all code checked in the repo should be unit tested -### Don't get attached to code +#### Don't get attached to code - It's ok to delete, discard, retire code that is not useful any more - Don't take it personally when people suggest changes or simplification -### Always plan before writing code +#### Always plan before writing code - File a GitHub issue - Think about what to do and how to do it @@ -253,13 +255,13 @@ - The best code is the one that we avoid to write through a clever mental kung-fu move -### Think hard about naming +#### Think hard about naming - Finding a name for a code object, notebook, is extremely difficult but very important to build a mental map - Spend the needed time on it -### Look for inconsistencies +#### Look for inconsistencies - Stop for a second after you have, before sending out: - Implemented code or a notebook @@ -273,7 +275,7 @@ - Do you see inconsistencies, potential issues? 
- It will take less and less time to become good at this -### No ugly hacks +#### No ugly hacks - We don't tolerate "ugly hacks", i.e., hacks that require lots of work to be undone (much more than the effort to do it right in the first place) @@ -281,11 +283,11 @@ dependency between distant pieces of code - Ugly hacks spreads everywhere in the code base -# Our coding suggestions +## Our coding suggestions -## Being careful with naming +### Being careful with naming -### Follow the conventions +#### Follow the conventions - Name executable files (scripts) and library functions using verbs (e.g., `download.py`, `download_data()`) @@ -301,7 +303,7 @@ … ``` -### Follow spelling rules +#### Follow spelling rules - We spell commands in lower-case, and programs with initial upper case: - "Git" (as program), "git" (as the command) @@ -323,9 +325,9 @@ global - Profit-and-loss: PnL instead of pnl or PNL -### Search good names, avoid bad names +#### Search good names, avoid bad names -#### General naming rules +##### General naming rules - Naming things properly is one of the most difficult task of a programmer / data scientist @@ -353,7 +355,7 @@ - The name should capture what the data structure represents (its semantics) and not how it is implemented -#### Do not be stingy +##### Do not be stingy - Why calling an object `TimeSeriesMinStudy` instead of `TimeSeriesMinuteStudy`? - Saving 3 letters is not worth @@ -362,7 +364,7 @@ - If you don't like to type, we suggest you get a better keyboard, e.g., [this](https://kinesis-ergo.com/shop/advantage2/) -#### Do not abbreviate just to save characters +##### Do not abbreviate just to save characters - Abbreviations just to save space are rarely beneficial to the reader. E.g., - Fwd (forward) @@ -370,7 +372,7 @@ - Act (actual) - Exp (expected) -#### When to use abbreviations +##### When to use abbreviations - We could relax this rule for short lived functions and variables in order to save some visual noise. @@ -387,7 +389,7 @@ - Col (column) - Vol (volatility) while volume is always spelled out -### Avoid code stutter +#### Avoid code stutter - An example of code stutter: you want to add a function that returns `git` root path in a module `git` @@ -411,9 +413,9 @@ ``` - This is not only aesthetic reason but a bit related to a weak form of DRY -## Comments and docstrings +### Comments and docstrings -### General conventions +#### General conventions - Code needs to be properly commented - We follow python standard [PEP 257](https://www.python.org/dev/peps/pep-0257/) @@ -427,7 +429,7 @@ - Epytext - Numpydoc -### Descriptive vs imperative style +#### Descriptive vs imperative style - We decided to use imperative style for our comments and docstrings - Pylint and other python QA tools favor an imperative style @@ -438,7 +440,7 @@ e.g. don't write "Returns the pathname ...". 
``` -### Docstrings style +#### Docstrings style - We follow ReST (aka re-Structured Text) style for docstrings which is: - The most widely supported in the python community @@ -496,7 +498,7 @@ ``` - [More examples of and discussions on python docstrings](https://stackoverflow.com/questions/3898572) -### Comments style +#### Comments style - Comments follow the same style of docstrings, e.g., imperative style with period `.` at the end @@ -544,7 +546,7 @@ - If you want to separate an `if` statement from a bunch of code preceeding it, you can leave an empty comment before it -### Replace empty lines in code with comments +#### Replace empty lines in code with comments - The problem with empty lines is that they are visually confusing since one empty line is used also to separate functions. For this reason we suggest @@ -580,7 +582,7 @@ ... ``` -### Comment chunk of codes +#### Comment chunk of codes - Avoid wall-of-code, by commenting chunks of code that perform a cohesive work @@ -633,7 +635,7 @@ config.save_to_file(prod_dir, config_tag) ``` -### Referring to an object in code comments +#### Referring to an object in code comments - In general, **avoid** this whenever possible - Code object names (e.g., function, class, params) are often subject to change, @@ -659,7 +661,7 @@ # The dataframe `df_tmp` is used for ... ``` -### Avoid distracting comments +#### Avoid distracting comments - Use comments to explain the high level logic / goal of a piece of code and not the details, e.g., do not comment things that are obvious @@ -669,7 +671,7 @@ _LOG.info("Results are %s", ...) ``` -### Commenting out code +#### Commenting out code - When we comment out code, we should explain why it is no longer relevant - _Bad_ @@ -684,7 +686,7 @@ is_alive = pd.Series(True, index=metadata.index) ``` -### Use type hints +#### Use type hints - We expect new code to use type hints whenever possible - See [PEP 484](https://www.python.org/dev/peps/pep-0484/) @@ -693,12 +695,12 @@ - We plan to start using static analyzers (e.g., `mypy`) to check for bugs from type mistakes and to enforce type hints at run-time, whenever possible -### Interval notation +#### Interval notation - Intervals are represented with `[a, b), (a, b], (a, b), [a, b]` - We don't use the other style `[a, b[` -### If you find a bug or obsolete docstring/TODO in the code +#### If you find a bug or obsolete docstring/TODO in the code - The process is: - Do a `git blame` to find who wrote the code @@ -709,14 +711,14 @@ - How to reproduce it, ideally a unit test - Stacktrace -## Linter +### Linter - The linter is in charge of reformatting the code according to our conventions and reporting potential problems - You can find instructions on how to run linter at the - [First review process](/docs/coding/all.submit_code_for_review.how_to_guide.md) doc + [First review process](First_review_process.md) doc -### Remove linter messages +#### Remove linter messages - When the linter reports a problem: - We assume that linter messages are correct, until the linter is proven wrong @@ -732,7 +734,7 @@ infer, then you should question whether that behavior is really needed - A human reader would probably be as confused as the linter is -### When to disable linter messages +#### When to disable linter messages - If you really believe you should override the linter in this particular case, then use something like: @@ -756,7 +758,7 @@ import config.logging_settings ``` -### Prefer non-inlined linter comments +#### Prefer non-inlined linter comments - As for the 
general comments, we prefer make linter comments non-inlined - However, sometimes there is no other choice than an inlined comment to get the @@ -778,7 +780,7 @@ # pylint: enable=line-too-long ``` -### Don't mix real changes with linter changes +#### Don't mix real changes with linter changes - We don't commit changes that modify the code together with linter reformatting, unless the linting is applied to the changes we just made @@ -806,13 +808,13 @@ - You can make this change directly on `master` or do a PR if you want to be extra sure: your call -## Logging +### Logging -### Always use logging instead of prints +#### Always use logging instead of prints - Always use `logging` and never `print()` to monitor the execution -### Our logging idiom +#### Our logging idiom - In order to use our logging framework (e.g., `-v` from command lines, and much more) use: @@ -832,7 +834,7 @@ - E.g., when there is a bug one can run with `-v DEBUG` and see what's happening right before the bug -### Logging level +#### Logging level - Use `_LOG.warning` for messages to the final user related to something unexpected where the code is making a decision that might be controversial @@ -852,7 +854,7 @@ sometimes this happens silently and it is reported only from the OS return code -### How to pick the level for a logging statement +#### How to pick the level for a logging statement - If all the debug info was printed at `INFO` level, the output will be too slow by default @@ -866,7 +868,7 @@ - So in prod mode you need to know which part you want to debug, since printing everything at `INFO` level is not possible -### Use positional args when logging +#### Use positional args when logging - _Bad_ ```python @@ -882,7 +884,7 @@ - The reason is that in the second case the string is not built unless the logging is actually performed, which limits time overhead from logging -### Exceptions don't allow positional args +#### Exceptions don't allow positional args - For some reason people tend to believe that using the `logging` / `dassert` approach of positional param to exceptions @@ -904,7 +906,7 @@ - There is little time overhead since if you get to the exception probably the code is going to terminate, and it's not in a hot loop -### Report warnings +#### Report warnings - If there is a something that is suspicious but you don't feel like it's worthwhile to assert, report a warning with: @@ -917,9 +919,9 @@ - Send the rest to warnings.log - At the end of the run, reports "there are warnings in warnings.log" -## Assertions +### Assertions -### Validate values before an assignment +#### Validate values before an assignment - We consider this as an extension of a pre-condition ("only assign values that are correct") rather than a postcondition @@ -944,7 +946,7 @@ hdbg.dassert_isinstance(col_rename_func, collections.Callable) ``` -### Encode the assumptions using assertions +#### Encode the assumptions using assertions - If your code makes an assumption don’t just write a comment, but implement an assertion so the code can’t be executed if the assertion is not verified @@ -953,7 +955,7 @@ hdbg.dassert_lt(start_date, end_date) ``` -### Use positional args when asserting +#### Use positional args when asserting - `dassert_*` is modeled after logging so for the same reasons one should use positional args @@ -966,7 +968,7 @@ hdbg.dassert_eq(a, 1, "No info for %s", method) ``` -### Report as much information as possible in an assertion +#### Report as much information as possible in an assertion - When using a `dassert_*` 
you want to give to the user as much information as possible to fix the problem @@ -987,9 +989,9 @@ pesky spaces that make the value unclear, or to make the error as readable as possible -## Imports +### Imports -### Don't use evil `import *` +#### Don't use evil `import *` - Do not use in notebooks or code the evil `import *` - _Bad_ @@ -1007,7 +1009,7 @@ with the namespace - [Is evil in many other ways](https://stackoverflow.com/questions/2386714/why-is-import-bad) -### Cleaning up the evil `import *` +#### Cleaning up the evil `import *` - To clean up the mess you can: - For notebooks @@ -1019,7 +1021,7 @@ to tweak the path of symbols exported by a library - This is an advanced topic and you should rarely use it -### Avoid `from ... import ...` +#### Avoid `from ... import ...` - Import should always start from `import`: ```python @@ -1052,7 +1054,7 @@ - `np.read_documents()` at least gives information of which packages is it coming from and enables us to track it down to the code -### Exceptions to the import style +#### Exceptions to the import style - We try to minimize the exceptions to this rule to avoid to keep this rule simple, rather than discussing about @@ -1064,7 +1066,7 @@ in order to avoid typing everywhere, since we want to use type hints as much as possible -### Always import with a full path from the root of the repo / submodule +#### Always import with a full path from the root of the repo / submodule - _Bad_ ```python @@ -1076,7 +1078,7 @@ ``` - In this way your code can run without depending upon your current dir -### Baptizing module import +#### Baptizing module import - Each module that can be imported should have a docstring at the very beginning (before any code) describing how it should be imported @@ -1098,7 +1100,7 @@ - The goal is to have always the same imports so it's easy to move code around, without collisions -### Examples of imports +#### Examples of imports - Example 1 - _Bad_ @@ -1128,9 +1130,9 @@ import helpers.hdbg as hdbg ``` -## Scripts +### Scripts -### Use Python and not bash for scripting +#### Use Python and not bash for scripting - We prefer to use python instead of bash scripts with very few exceptions - E.g., scripts that need to modify the environment by setting env vars, like @@ -1148,7 +1150,7 @@ - Our approach is to make simple to create scripts in python that are equivalent to sequencing shell commands, so that can evolve in complex scripts -### Skeleton for a script +#### Skeleton for a script - The ingredients are: - `dev_scripts/script_skeleton.py`: a template to write simple scripts you can @@ -1162,7 +1164,7 @@ - A simple example is: `dev_scripts/git/gup.py` - A complex example is: `dev_scripts/replace_text.py` -### Some useful patterns +#### Some useful patterns - Some useful patterns / idioms that are supported by the framework are: - Incremental mode: you skip an action if its outcome is already present @@ -1171,7 +1173,7 @@ - Non-incremental mode: clean and execute everything from scratch - Dry-run mode: the commands are written to screen instead of being executed -### Use scripts and not notebooks for long-running jobs +#### Use scripts and not notebooks for long-running jobs - We prefer to use scripts to execute code that might take long time (e.g., hours) to run, instead of notebooks @@ -1187,7 +1189,7 @@ - You can experiment with notebooks, move the code into a library, and wrap it in a script -### Follow the same structure +#### Follow the same structure - All python scripts that are meant to be executed directly should: 
@@ -1209,7 +1211,7 @@ ``` 4. Ideally use `argparse` to have a minimum of customization -### Use clear names for the scripts +#### Use clear names for the scripts - In general scripts (like functions) should have a name like "action_verb". - _Bad_ @@ -1228,9 +1230,9 @@ `TaskXYZ_edgar_timestamp_dataset_extractor.py` - Also where the script is located should give some clue of what is related to -## Functions +### Functions -### Avoid using non-exclusive `bool` arguments +#### Avoid using non-exclusive `bool` arguments - While a simple `True`/`False` switch may suffice for today's needs, very often more flexibility is eventually needed @@ -1246,7 +1248,7 @@ parameter `None`. This is only a good route if the default operation is non-controversial / intuitively obvious. -### Try to make functions work on multiple types +#### Try to make functions work on multiple types - We encourage implementing functions that can work on multiple related types: - _Bad_: implement `demean_series()`, `demean_dataframe()` @@ -1259,7 +1261,7 @@ - Try to return the same type of the input, if possible - E.g., the function called on a `pd.Series` returns a `pd.Series` -### Avoid hard-wired column name dependencies +#### Avoid hard-wired column name dependencies - When working with dataframes, we often want need handle certain columns differently, or perform an operation on a strict subset of columns @@ -1294,7 +1296,7 @@ - This prevents hidden column name dependencies from spreading like a virus throughout the codebase -### Single exit point from a function +#### Single exit point from a function - Consider the following _Bad_ function ```python @@ -1350,14 +1352,14 @@ return ``` -### Order of function parameters +#### Order of function parameters -#### Problem +##### Problem - We want to have a standard, simple, and logical order for specifying the arguments of a function -#### Decision +##### Decision - The preferred order is: - Input parameters @@ -1365,7 +1367,7 @@ - In-out parameters - Default parameters -### Consistency of ordering of function parameters +#### Consistency of ordering of function parameters - Try to: - Keep related variables close to each other @@ -1376,14 +1378,14 @@ - Use linter to check consistency of types between function definition and invocation -### Style for default parameter +#### Style for default parameter -#### Problem +##### Problem - How to assign default parameters in a function to make them clear and distinguishable? -#### Decision +##### Decision - We make all the default parameters keyword-only - This means that we should always specify default parameters using a keyword @@ -1440,7 +1442,7 @@ function1(..., dir_name=dir_name) ``` -#### Rationale +##### Rationale - Pros of the _Good_ vs _Bad_ style - When you wrap multiple functions, each function needs to propagate the @@ -1458,9 +1460,9 @@ - Cons: - One needs to add `Optional` to the type hint -### Calling functions with default parameters +#### Calling functions with default parameters -#### Problem +##### Problem - You have a function ```python @@ -1474,7 +1476,7 @@ ``` - How should it be invoked? 
-#### Decision +##### Decision - We prefer to - Assign directly the positional parameters @@ -1493,7 +1495,7 @@ func(task_name, dataset_dir, clobber=clobber) ``` -#### Rationale +##### Rationale - Pros of _Good_ vs _Bad_ style - If a new parameter with a default value is added to the function `func` @@ -1509,9 +1511,9 @@ - Cons: - None -### Don't repeat non-default parameters +#### Don't repeat non-default parameters -#### Problem +##### Problem - Given a function with the following interface: ```python @@ -1520,7 +1522,7 @@ ``` how to invoke it? -#### Decision +##### Decision - Positional arguments are not default, so not keyword-only for consistency - _Bad_ @@ -1540,7 +1542,7 @@ mult_and_sum(a, b, c) ``` -#### Rationale +##### Rationale - Pros of _Good_ vs _Bad_ - Non-default parameters in Python require all the successive parameters to be @@ -1570,9 +1572,9 @@ mult_and_sum(mul1, mul2, sum_) ``` -## Writing clear beautiful code +### Writing clear beautiful code -### Keep related code close +#### Keep related code close - E.g., keep code that computes data close to the code that uses it. - This holds also for notebooks: do not compute all the data structure and then @@ -1581,7 +1583,7 @@ “processes it”. In this way it’s easier to see “blocks” of code that are dependent from each other, and run only a cluster of cells. -### Order functions in topological order +#### Order functions in topological order - Order functions / classes in topological order so that the ones at the top of the files are the "innermost" and the ones at the end of the files are the @@ -1591,7 +1593,7 @@ - Linter reorders functions and classes in the topological order so make sure you run it after adding new ones -### Distinguish public and private functions +#### Distinguish public and private functions - The public functions `foo_bar()` (not starting with `_`) are the ones that make up the interface of a module and that are called from other modules and @@ -1603,7 +1605,7 @@ - Some references: - [StackOverflow](https://stackoverflow.com/questions/1641219/does-python-have-private-variables-in-classes?noredirect=1&lq=1) -### Keep public functions organized in a logical order +#### Keep public functions organized in a logical order - Keep the public functions in an order related to the use representing the typical flow of use, e.g., @@ -1659,7 +1661,7 @@ - IMO the worst issue is that they don’t play super-well with Jupyter autoreload -### Do not make tiny wrappers +#### Do not make tiny wrappers - Examples of horrible functions: - How many characters do we really saved? If typing is a problem, learn to @@ -1681,7 +1683,7 @@ return f_name ``` -### Regex +#### Regex - The rule of thumb is to compile a regex expression, e.g., ```python @@ -1690,7 +1692,7 @@ only if it's called more than once, otherwise the overhead of compilation and creating another var is not justified -### Do not introduce another “concept” unless really needed +#### Do not introduce another “concept” unless really needed - We want to introduce degrees of freedom and indirection only when we think this can be useful to make the code easy to maintain, read, and expand. @@ -1725,7 +1727,7 @@ ``` then the variable and its value are in contrast. -### Return `None` or keep one type +#### Return `None` or keep one type - Functions that return different types can make things complicated downstream, since the callers need to be aware of all of it and handle different cases. 
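- For instance, a minimal sketch of the idea (hypothetical helper, not from the codebase): return `None` for the "missing" case and a single type otherwise, instead of mixing types:
  ```python
  from typing import Optional

  def get_exchange_id(symbol: str) -> Optional[str]:
      # Return None consistently when the symbol can't be parsed, and a str
      # otherwise, so callers only need to handle two well-defined cases.
      parts = symbol.split("::")
      if len(parts) != 2:
          return None
      return parts[0]
  ```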
@@ -1752,7 +1754,7 @@ df["Name"] = df["Tags"].apply(extract_name) ``` -### Avoid wall-of-text functions +#### Avoid wall-of-text functions - _Bad_ ```python @@ -1812,9 +1814,9 @@ - You should at least split the functions in chunks using `#` or even better comment what each chunk of code does. -## Writing robust code +### Writing robust code -### Don’t let your functions catch the default-itis +#### Don’t let your functions catch the default-itis - Default-itis is a disease of a function that manifests itself by getting too many default parameters. @@ -1826,7 +1828,7 @@ Resist this urge! `grep` is friend. Pycharm does this refactoring automatically. -### Explicitly bind default parameters +#### Explicitly bind default parameters - It’s best to explicitly bind functions with the default params so that if the function signature changes, your functions doesn’t confuse a default param was @@ -1846,7 +1848,7 @@ ) ``` -### Don’t hardwire params in a function call +#### Don’t hardwire params in a function call - _Bad_ ```python @@ -1866,7 +1868,7 @@ only needed params, which is as much as we can require from the called function. -### Make `if-elif-else` complete +#### Make `if-elif-else` complete - In general all the `if-elif-else` statements should to be complete, so that the code is robust. @@ -1902,7 +1904,7 @@ doesn't run for 1 hr and then crash because the name of the file is incorrect -### Add TODOs when needed +#### Add TODOs when needed - When there is something that you know you should have done, but you didn’t have time to do, add a TODO, possibly using your github name e.g., @@ -1940,9 +1942,9 @@ # TODO(Sergey): P1 This can be implemented in pandas using a range generation. ``` -## Common Python mistakes +### Common Python mistakes -### `==` vs `is` +#### `==` vs `is` - `is` checks whether two variables point to the same object (aka reference equality), while `==` checks if the two pointed objects are equivalent (value @@ -1968,7 +1970,7 @@ - For more info checks [here](https://stackoverflow.com/questions/132988/is-there-a-difference-between-and-is-in-python) -### `type()` vs `isinstance()` +#### `type()` vs `isinstance()` - `type(obj) == list` is worse since we want to test for reference equality (the type of object is a list) and not the type of obj is equivalent to a list. 
@@ -1987,7 +1989,7 @@ - For more info check [here](https://stackoverflow.com/questions/1549801/what-are-the-differences-between-type-and-isinstance) -## Unit tests +### Unit tests - Provide a minimal end-to-end unit testing (which creates a conda environment and then run a few unit tests) @@ -2006,9 +2008,9 @@ - For more information on our testing conventions and guidelines, see `docs/coding/all.unit_tests.how_to_guide.md` -## Refactoring +### Refactoring -### When moving / refactoring code +#### When moving / refactoring code - If you move files, refactor code, move functions around make sure that: - Code and notebook work (e.g., imports and caller of the functions) @@ -2032,7 +2034,7 @@ - Run notebooks (see [here](https://n-xovwktmtjsnaxyc2mwes2xu7pohqedmdm6zjw5q-0lu-script.googleusercontent.com/userCodeAppPanel#)) -### Write script for renamings +#### Write script for renamings - When you need to rename any code object that is being used in many files, use `dev_scripts/replace_text.py` to write a script that will implement your task @@ -2043,9 +2045,9 @@ - Commit the created script to the mentioned folder so then your team members can use it to implement renaming in other libs -## Architectural and design pattern +### Architectural and design pattern -### Research quality vs production quality +#### Research quality vs production quality - Code belonging to top level libraries (e.g., `//amp/core`, `//amp/helpers`) and production (e.g., `//.../db`, `vendors`) needs to meet high quality @@ -2071,7 +2073,7 @@ - We should be able to raise the quality of a piece of research code to production quality when that research goes into production -### Always separate what changes from what stays the same +#### Always separate what changes from what stays the same - In both main code and unit test it's not a good idea to repeat the same code - _Bad_ @@ -2128,7 +2130,7 @@ ``` - Yes, Version A is _Bad_ and Version B is _Good_ -### Organize scripts as pipelines +#### Organize scripts as pipelines - One can organize complex computations in stages of a pipeline - E.g., to parse EDGAR forms @@ -2152,7 +2154,7 @@ the pieces into a throw-away script where I hardwire the file names and so on -### Make filename unique +#### Make filename unique - _Problem_ - We have a lot of structure / boilerplate in our project around RH @@ -2185,7 +2187,7 @@ prefixes? 
- This seems to be an infrequent case -### Incremental behavior +#### Incremental behavior - Often we need to run the same code over and over - E.g., because the code fails on an unexpected point and then we need to @@ -2208,7 +2210,7 @@ - If output file exists and param `--force`, then report a log.warn and rewrite output file -### Run end-to-end +#### Run end-to-end - Try to run things end-to-end (and from scratch) so we can catch these unexpected issues and code defensively @@ -2218,7 +2220,7 @@ data and we have 10 years of data is going to take 120 hours (=5 days) to run on the entire data set -### Think about scalability +#### Think about scalability - Do experiments to try to understand if a code solution can scale to the dimension of the data we have to deal with @@ -2227,12 +2229,12 @@ - Remember that typically we need to run the same scripts multiple times (e.g., for debug and / or production) -### Use command line for reproducibility +#### Use command line for reproducibility - Try to pass params through command line options when possible - In this way a command line contains all the set-up to run an experiment -### Structure the code in terms of filters +#### Structure the code in terms of filters - Focus on build a set of "filters" split into different functions, rather than a monolithic flow @@ -2246,18 +2248,18 @@ 6. Patch up SQL (e.g., inserting missing TR codes and reporting them to us so we can check with TR) -## Code style for different languages +### Code style for different languages -### SQL +#### SQL - You can use the package https://github.com/andialbrecht/sqlparse to format SQL queries - There is also an on-line version of the same formatter at https://sqlformat.org -# Conventions (Addendum) +## Conventions (Addendum) -## Be patient +### Be patient - For some reason talking about conventions makes people defensive and uncomfortable, sometimes. @@ -2268,7 +2270,7 @@ - If you are unsure or indifferent to a choice, be flexible and let other persons that seem to be less flexible decide. -## Goal +### Goal - The goal of the conventions is to simplify our job by removing ambiguity - There is no right or wrong: that's why it's a convention and not a law of @@ -2286,7 +2288,7 @@ about them, and reviewers don't have to be distracted with pointing out the lints -## Keep the rules simple +### Keep the rules simple - E.g., assume that we accepted the following rules: - Git is capitalized if it refers to the tool and it's not capitalized when it @@ -2306,7 +2308,7 @@ - E.g., every name of tools or library is always capitalized - This is simple to remember and automatically enforce -## Allow turning off the automatic tools +### Allow turning off the automatic tools - We understand that tools can't always understand the context and the subtleties of human thoughts, and therefore they yield inevitably to false @@ -2318,7 +2320,7 @@ overriding the tool becomes a slippery slope for ignoring the rules. - Patience and flexibility is advised here. 
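- As an illustration, a minimal sketch (hypothetical line, assuming the standard `pylint` inline-disable syntax) of overriding a tool narrowly, on a single line, so the exception stays visible in review instead of disabling the rule globally:
  ```python
  # Silence one specific check on one line instead of turning the rule off for
  # the whole file or repo.
  long_url = "https://github.com/kaizen-ai/kaizenflow/issues/370#issue-1782574355"  # pylint: disable=line-too-long
  ```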
-## Make the spell-checker happy +### Make the spell-checker happy - The spell-checker is not always right: false positives are often very annoying - We prefer to find a way to make the spell-checker happy rather than argue that diff --git a/docs/coding/all.gsheet_into_pandas.how_to_guide.md b/docs/coding/all.gsheet_into_pandas.how_to_guide.md index 13dc673fa9..03b558cb0b 100644 --- a/docs/coding/all.gsheet_into_pandas.how_to_guide.md +++ b/docs/coding/all.gsheet_into_pandas.how_to_guide.md @@ -1,25 +1,26 @@ - +# Gsheet Into Pandas - [Connecting Google Sheets to Pandas](#connecting-google-sheets-to-pandas) - * [Installing gspread-pandas](#installing-gspread-pandas) - * [Configuring gspread-pandas](#configuring-gspread-pandas) - * [Using `gspread` on the server](#using-gspread-on-the-server) -- [Using gspread-pandas](#using-gspread-pandas) + * [Installing libraries](#installing-libraries) + * [Check installation](#check-installation) + * [Authentication](#authentication) + + [In short](#in-short) +- [Testing gspread-pandas](#testing-gspread-pandas) -# Connecting Google Sheets to Pandas +## Connecting Google Sheets to Pandas - There are two layers of the API - [gspread](https://docs.gspread.org/) - This allows to connect to Google Sheets API - [gspread-pandas](https://gspread-pandas.readthedocs.io) - - This allows to interact with Google Sheets through Pandas DataFrames, using - `gspread` + - This allows to interact with Google Sheets through Pandas DataFrames, + using `gspread` -## Installing libraries +### Installing libraries - The library should be automatically installed in the Dev container @@ -33,10 +34,11 @@ docker> sudo /bin/bash -c "(source /venv/bin/activate; pip install gspread)" ``` -## Check installation +### Check installation - To check that the library is installed - In a notebook + ```bash import gspread print(gspread.__version__) @@ -52,13 +54,12 @@ 5.10.0 ``` -## Authentication +### Authentication - It's best to access Google API using a "Service Account", which is used for a bots -- Since `gspread-pandas` leverages `gspread`, you can follow the instructions for - gspread - https://docs.gspread.org/en/v6.0.0/oauth2.html +- Since `gspread-pandas` leverages `gspread`, you can follow the instructions + for gspread https://docs.gspread.org/en/v6.0.0/oauth2.html - There are two ways to authenticate - OAuth Client ID @@ -66,9 +67,11 @@ - More details are in - `gspread`: https://docs.gspread.org/en/latest/oauth2.html - - `gspread-pandas`: https://gspread-pandas.readthedocs.io/en/latest/configuration.html + - `gspread-pandas`: + https://gspread-pandas.readthedocs.io/en/latest/configuration.html + +#### In short -### In short - Go to Google Developers Console and create a new project or select one you already have - E.g., name "gp-gspread", and ID "gp-gspread-426713" @@ -79,12 +82,13 @@ - Service account details - Service account name: gspread - Service account ID: gspread - - Email address: gspread@gp-gspread-426713.iam.gserviceaccount.com + - Email address: gspread@gp-gspread-426713.iam.gserviceaccount.com - Role: owner - Click on `gspread` - Keys -> Create new key -> JSON - A file is downloaded -> more ~/Downloads/gspread-gp-94afb83adb02.json + + > more ~/Downloads/gspread-gp-94afb83adb02.json ``` { "type": "service_account", @@ -99,6 +103,7 @@ "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/gp-gspread%40gspread-gp.iam.gserviceaccount.com", "universe_domain": "googleapis.com" } + ``` - Move the key in the right place ``` @@ -115,7 +120,7 @@ exception 
when trying to access this spreadsheet from your application or a script. -# Testing gspread-pandas +## Testing gspread-pandas - The notebook with the usage example is located at `amp/core/notebooks/gsheet_into_pandas_example.ipynb`. diff --git a/docs/coding/all.hplayback.how_to_guide.md b/docs/coding/all.hplayback.how_to_guide.md new file mode 100644 index 0000000000..2330831944 --- /dev/null +++ b/docs/coding/all.hplayback.how_to_guide.md @@ -0,0 +1,216 @@ + + +- [Playback](#playback) +* [Code and tests](#code-and-tests) +* [Using playback](#using-playback) + - [Quick start](#quick-start) + - [Example 1: testing `get_sum()`](#example-1-testing-get_sum) + - [Example 2: testing `_render_plantuml()` from `render_md.py`](#example-2-testing-_render_plantuml-from-render_mdpy) + + + +# Playback + +- `Playback` is a way to automatically generate a unit test for a given function + by capturing the inputs applied to the function by the external world + +- The working principle is: + 1. Instrument the target function `f()` to test with a `Playback` object or + with a decorator `@playback` + 2. Run the function `f()` using the external code to drive its inputs + - E.g., while the function is executed as part of a more complex system, or + in a notebook + 3. The playback framework: + - Captures the inputs and the output of the function `f()` + - Generates Python code to apply the stimuli to `f()` and to check its + output against the expected output + 4. Modify the code automatically generated by `Playback` to create handcrafted + unit tests + +## Code and tests + +- The code for `Playback` is located at `helpers/hplayback.py` +- Unit tests for `Playback` with useful usage examples are located at + `helpers/test/test_playback.py` + +## Using playback + +### Quick start + +- Given a function to test like: + +```python +def function_under_test(...) -> ...: + ... + + ... + res = ... + return res +``` + +```python +def function_under_test(...) -> ...: + import helpers.hplayback as hplayb + playback = hplayb.Playback("assert_equal") + + ... + + ... + + res = ... 
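    # Generate the Python code of the unit test from the captured inputs and this result.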
+ code = playback.run(res) + print(code) + return res +``` + +### Example 1: testing `get_sum()` + +- Assume that we want unit test a function `get_sum()` + + ```python + def get_sum(a: List[int], b: List[int]) -> Any: + c = a + b + return c + ``` + +- Assume that typically `get_sum()` gets its inputs from a complex pipeline + + ```python + def complex_data_pipeline() -> Tuple[List[int], List[int]]: + # Incredibly complex pipeline generating: + a = [1, 2, 3] + b = [4, 5, 6] + return a, b + ``` + +- The function is called with: + + ```python + a, b = complex_data_pipeline() + c = get_sum(a, b) + ``` + +- We don't want to compute by hand the inputs `a, b`, but we can reuse + `complex_data_pipeline` to create a realistic workload for the function under + test + +- Instrument the code with `Playback`: + + ```python + import helpers.playback as hplayb + + def get_sum(a: List[int], b: List[int]) -> Any: + playback = hplayb.Playback("assert_equal") + c = a + b + code = playback.run(res) + print(code) + return c + ``` + +- Create the playback object + + ```python + playback = hplayb.Playback("assert_equal") + ``` + + which specifies: + - The unit test mode: "check_string" or "assert_equal" + - The function name that is being tested: in our case, "get_sum" + - The function parameters that were created earlier + +- Run it with: + + ```python + a, b = complex_data_pipeline() + c = get_sum(a, b) + ``` + +- Run the playback passing the expected outcome as a parameter + + ```python + code = playback.run(res) + ``` + +- The output `code` will contain a string with the unit test for `get_sum()` + + ```python + import helpers.unit_test as hut + + class TestGetSum(hut.TestCase): + def test1(self) -> None: + # Initialize function parameters. + a = [1, 2, 3] + b = [4, 5, 6] + # Get the actual function output. + act = get_sum(a, b) + # Create the expected function output. + exp = [1, 2, 3, 4, 5, 6] + # Check whether the expected value equals the actual value. + self.assertEqual(act, exp) + ``` + +### Example 2: testing `_render_plantuml()` from `render_md.py` + +- Copy real `im_architecture.md` to a test location + +- Add playback into the code: + + ```python + ... + import helpers.playback as hplayb + ... + def _render_plantuml( + in_txt: List[str], out_file: str, extension: str, dry_run: bool + ) -> List[str]: + # Generate test. + playback = hplayb.Playback("check_string") + print(prnt.frame(playback.run(None))) + ... + ... + ``` + +- Run `render_md.py -i im_architecture.md` + +- The following output is prompted: + + ```python + # Test created for __main__._render_plantuml + import helpers.unit_test as hut + import jsonpickle + import pandas as pd + + class TestRenderPlantuml(hut.TestCase): + def test1(self) -> None: + # Define input variables + in_txt = ["", ..., "", "> **GP:**: Not urgent", ""] + out_file = "im_architecture.md" + extension = "png" + dry_run = False + # Call function to test + act = _render_plantuml(in_txt=in_txt, out_file=out_file, extension=extension, dry_run=dry_run) + act = str(act) + # Check output + self.check_string(act) + ``` + +- `in_txt` value is too long to keep it in test - needed to be replaced with + previously generated file. 
Also some cosmetic changes are needed and code is + ready to paste to the existing test: + ```python + def test_render_plantuml_playback1(self) -> None: + """Test real usage for im_architecture.md.test""" + # Define input variables + file_name = "im_architecture.md.test" + in_file = os.path.join(self.get_input_dir(), file_name) + in_txt = io_.from_file(in_file).split("\n") + out_file = os.path.join(self.get_scratch_space(), file_name) + extension = "png" + dry_run = True + # Call function to test + act = rmd._render_plantuml( + in_txt=in_txt, out_file=out_file, extension=extension, dry_run=dry_run + ) + act = "\n".join(act) + # Check output + self.check_string(act) + ``` diff --git a/docs/coding/all.imports_and_packages.how_to_guide.md b/docs/coding/all.imports_and_packages.how_to_guide.md index a1b6e7225c..1637d531d1 100644 --- a/docs/coding/all.imports_and_packages.how_to_guide.md +++ b/docs/coding/all.imports_and_packages.how_to_guide.md @@ -1,4 +1,6 @@ -# Imports and packages +# Imports And Packages + +## Imports and packages @@ -14,10 +16,9 @@ - TODO(gp): Consolidate here any other rule from other gdoc -# Goals of packages +## Goals of packages - The goal of creating packages is to: - - Simplify the import from clients - Hide in which file the actual code is, so that we can reorganize the code without having to change all the client code @@ -36,7 +37,7 @@ dtfsysonod.ArmaGenerator(...) ``` -## Circular dependency (aka import cycle, import loop) +### Circular dependency (aka import cycle, import loop) - The simplest case of circular import is a situation when in lib `A` we have `import B`, and in lib B we have `import A` @@ -45,26 +46,21 @@ a couple of minutes, but it will provide the most reliable and thorough check for circular imports -## Rules for imports +### Rules for imports - We follow rules to avoid import loops: - - Code inside a package should import directly a file in the same package and not use the package - - E.g., `im_v2/common/data/client/data_frame_im_clients.py` - - Good ```python import im_v2.common.data.client.abstract_im_clients as imvcdcaimcl ``` - - Bad ```python import im_v2.common.data.client as icdc ``` - - Code from a package should import other packages, instead of importing directly the file - We don't allow any import loop that can be detected statically (i.e., by @@ -82,7 +78,6 @@ want to pay the overhead only if we get enough benefit from this - We specify a short import in the `__init__.py` file for a package manually because the linter cannot do it automatically yet - - We use the first letters to build a short import and try to keep it less than 8 chars long, e.g., `im_v2.talos.data.client` -> `itdcl` - We insert an import docstring in the `__init__.py` file manually and then @@ -94,7 +89,7 @@ import im_v2.talos.data.client as itdcl ``` -## How to import code from unit tests +### How to import code from unit tests - To avoid churning client code when code is moved among files, we allow unit tests to both: @@ -122,7 +117,7 @@ - Given that both explanations are valid, we allow both styles -### Common unit test code +#### Common unit test code - Unit tests should not import from each other - If there is common code, it should go in libraries inside or outside `test` @@ -132,7 +127,7 @@ - E.g., we use `test/foobar_test_case.py` or `test/foobar_utils.py` - In other terms, test files are always leaves of the import graph -# Package/lib hierarchy and cycle prevention +## Package/lib hierarchy and cycle prevention - Static import cycles can be detected by the 
invoke `lint_detect_cycles` - To prevent import cycles, we want to enforce that certain packages don't @@ -164,7 +159,7 @@ - Any time we can break a file into smaller pieces, we should do that since this helps control the dependencies -# Anatomy of a package +## Anatomy of a package - TODO(gp): Let's use `dataflow` as a running example - A package has a special `__init__.py` exporting public methods diff --git a/docs/coding/all.integrate_repos.how_to_guide.md b/docs/coding/all.integrate_repos.how_to_guide.md index 3f7acb21f3..db8d095a08 100644 --- a/docs/coding/all.integrate_repos.how_to_guide.md +++ b/docs/coding/all.integrate_repos.how_to_guide.md @@ -1,4 +1,6 @@ -# How to integrate repos +# Integrate Repos + +## How to integrate repos @@ -12,7 +14,7 @@ -# Concepts +## Concepts - We have two dirs storing two forks of the same repo - Files are touched (e.g., added, modified, deleted) in each forks @@ -29,7 +31,7 @@ 2. The last integration point for each fork, at which the repos are the same, or at least aligned -# Invariants for the integration workflows +## Invariants for the integration workflows - The user runs commands in an abs dir, e.g., `/Users/saggese/src/{amp1,cmamp1}` - The user refers in the command line to `dir_basename`, which is the basename @@ -38,9 +40,9 @@ - The `dst_dir_basename` is assumed to be parallel to the `src_dir_basename` - The dirs are then transformed in absolute dirs `abs_src_dir` -# Integration process +## Integration process -## Preparation +### Preparation - Pull master @@ -116,7 +118,7 @@ > vimdiff ~/src/{amp1,cmamp1}/tasks.py; diff_to_vimdiff.py --dir1 ~/src/amp1 --dir2 ~/src/cmamp1 --subdir helpers ``` -## Integration +### Integration - Create the integration branches: @@ -199,7 +201,7 @@ > rsync --delete -a -r {src_dir}/ {dst_dir}/ ``` -## Double-check the integration +### Double-check the integration - Check that the regressions are passing on GH @@ -234,7 +236,7 @@ > i git_branch_diff_with -t base ``` -## Run tests +### Run tests - Check `amp` / `cmamp` using GH actions: diff --git a/docs/coding/all.jupyter_notebook.how_to_guide.md b/docs/coding/all.jupyter_notebook.how_to_guide.md index e2ecad5a2b..28ce66b25b 100644 --- a/docs/coding/all.jupyter_notebook.how_to_guide.md +++ b/docs/coding/all.jupyter_notebook.how_to_guide.md @@ -1,4 +1,6 @@ -# Jupyter notebook best practices +# Jupyter Notebook + +## Jupyter notebook best practices @@ -50,7 +52,7 @@ -# When to use a Jupyter notebook +## When to use a Jupyter notebook - A notebook can be used for various goals: - Tutorial / gallery @@ -68,13 +70,12 @@ base - We might want to add unit tests for it -# General structure of a notebook +## General structure of a notebook -## Description +### Description - At the top of the notebook add a description section explaining a notebook's goal and what it does, e.g., - ``` # Description @@ -85,7 +86,7 @@ by selecting a cell and then at Jupyter interface do `Cell -> Cell Type -> Markdown` -## Imports +### Imports - Add a code section importing the needed libraries - Autoreload modules to keep Jupyter and local code updated in real-time @@ -94,7 +95,6 @@ - Local imports from our lib - It's better to put all the imports in one cell and separate different import types by 1 empty line, e.g.: - ``` # Imports @@ -117,12 +117,11 @@ - In this way executing one cell is enough to configure the notebook -## Configuration +### Configuration - You can configure the notebooks with some utils, logging, and report info on how the notebook was executed (e.g., Git commit, libs, 
etc.) by using the following cell: - ``` # Configure logger. hdbg.init_logger(verbosity=logging.INFO) @@ -161,7 +160,7 @@ statsmodels: 0.13.5 ``` -# Make the notebook flow clear +## Make the notebook flow clear - Each notebook needs to follow a clear and logical flow, e.g: - Load data @@ -184,11 +183,11 @@ big) - You can collapse the cells and don't scroll back and forth too much -# General best practices +## General best practices -## Update calls only for Master/Gallery notebooks +### Update calls only for Master/Gallery notebooks -### Convention: +#### Convention: - We do our best to update the calls in the Master/Gallery notebooks but we don't guarantee that the fix is correct @@ -196,14 +195,14 @@ and tweak the call to enforce the old behavior, or even not do anything if there are too many changes -### Rationale: +#### Rationale: - We have dozens of ad-hoc research notebooks - When a piece of code is updated (e.g., `ImClient`) the change should be propagated everywhere in the code base, including the notebooks - This results in excessive amount of maintenance work which we want to avoid -## Keep code that belongs together in one cell +### Keep code that belongs together in one cell - It's often useful to keep in a cell computation that needs to be always executed together @@ -213,7 +212,7 @@ once we are more confident that it works correctly we can merge it in a cell (or even better in a function) -## Write beautiful code, even in notebooks +### Write beautiful code, even in notebooks - Follow the conventions and suggestions for [Python code style](Coding_Style_Guide.md) @@ -222,17 +221,17 @@ - In our opinion it's just better to always do write robust and readable code: it doesn't buy much time to cut corners -## Show how data is transformed as you go +### Show how data is transformed as you go - Print a few lines of data structures (e.g., `df.head(3)`) so one can see how data is transformed through the cells -## Use keyboard shortcuts +### Use keyboard shortcuts - Learn the default keyboard shortcuts to edit efficiently - You can use the vim plug-in (see below) and become 3x more ninja -## Strive for simplicity +### Strive for simplicity - Always make the notebook easy to be understood and run by somebody else - Explain what happens @@ -240,7 +239,7 @@ - Use decent variable names - Comment the results, when possible / needed -## Dependencies among cells +### Dependencies among cells - Try to avoid dependencies between cells - Even better avoid any dependency between cells, e.g.: @@ -251,7 +250,7 @@ re-initialize the notebook - For the same reason group functions in one cell that you can easily re-execute -## Re-execute from scratch +### Re-execute from scratch - Once in a while (e.g., once a day) - Commit your changes @@ -261,7 +260,7 @@ state or dependency in the code - Before a commit (and definitively before a PR) do a clean run -## Add comments for complex cells +### Add comments for complex cells - When a cell is too long, explain in a comment what a cell does, e.g., ``` @@ -275,18 +274,18 @@ - Another approach is to factor out the code in functions with clear names and simplify the flow -## Do not cut & paste code +### Do not cut & paste code - Cutting + paste + modify is _NEVER_ a good idea - It takes more time to clean up cut & paste code than doing right in the first place - Just make a function out of the code and call it! 
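- A minimal sketch of the idea, with hypothetical names, for a notebook where the same computation is needed for several datasets:
  ```python
  import pandas as pd

  def compute_daily_returns(df: pd.DataFrame, price_col: str = "close") -> pd.Series:
      """Compute daily percentage returns for one price column."""
      return df[price_col].pct_change()

  # Each cell then becomes a one-line call instead of a copy-pasted block, e.g.:
  #   btc_rets = compute_daily_returns(btc_df)
  #   eth_rets = compute_daily_returns(eth_df)
  ```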
-## Avoid "wall-of-code" cell +### Avoid "wall-of-code" cell - Obvious -## Avoid data biases +### Avoid data biases - Try to compute statistics on the entire data set so that results are representative and not dependent on a particular slice of the data @@ -294,19 +293,19 @@ - If it takes too long to compute the statistics on the entire data set, report the problem and we can think of how to speed it up -## Avoid hardwired constants +### Avoid hardwired constants - Don't use hardwired constants - Try to parametrize the code -## Explain where data is coming from +### Explain where data is coming from - If you are using data from a file (e.g., `/data/wd/RP_1yr_13_companies.pkl`), explain in a comment how the file was generated - Ideally report a command line to regenerate the data - The goal is for other people to be able to re-run the notebook from scratch -## Fix warnings +### Fix warnings - A notebook should run without warnings - Warnings can't be ignored since they indicate that the code is relying on a @@ -344,7 +343,7 @@ - If it's not obvious how to interpret or fix a warning file a bug, file a bug reporting clearly a repro case and the error message -## Make cells idempotent +### Make cells idempotent - Try to make a notebook cell able of being executed multiple times without changing its output value, e.g., @@ -378,7 +377,7 @@ df_without_1s = df[df["id"] != 1].copy() ``` -## Always look at the discarded data +### Always look at the discarded data - Filtering the data is a risky operation since once the data is dropped, nobody is going to go back and double check what exactly happened @@ -407,7 +406,7 @@ - Make absolutely sure you are not dropping important data - E.g., has the distribution of the data changed in the way you would expect? -## Use a progress bar +### Use a progress bar - Always use progress bars (even in notebooks) so that user can see how long it will take for a certain computation. @@ -417,7 +416,7 @@ from tqdm.autonotebook import tqdm ``` -# Notebooks and libraries +## Notebooks and libraries - It's ok to use functions in notebooks when building the analysis to leverage notebook interactivity @@ -431,7 +430,7 @@ - Otherwise, if you change a function in the lib, the notebook will not pull this change and use the old version of the function -## Pros +### Pros - The same notebook code can be used for different notebooks - E.g., the function to read the data from disk is an obvious example @@ -448,30 +447,30 @@ wants to run on the entire large dataset - The exploratory analysis can be moved towards modeling and then production -## Cons +### Cons - One have to scroll back and forth between notebook and the libraries to execute the cell with the functions and fix all the possible mistakes -# Recommendations for plots +## Recommendations for plots -## Use the proper y-scale +### Use the proper y-scale - If one value can vary from -1.0 to 1.0, force the y-scale between those limits so that the values are absolutes, unless this would squash the plot -## Make each plot self-explanatory +### Make each plot self-explanatory - Make sure that each plot has a descriptive title, x and y label - Explain the set-up of a plot / analysis - E.g., what is the universe of stocks used? What is the period of time? 
- Add this information also to the plots -## Avoid wall-of-text tables +### Avoid wall-of-text tables - Try to use plots summarizing the results besides the raw results in a table -## Use common axes to allow visual comparisons +### Use common axes to allow visual comparisons - Try to use same axes for multiple graphs when possible to allow visual comparison between graphs @@ -479,49 +478,49 @@ scales and add a plot with multiple graphs inside on the same axis (e.g., with y-log) -## Use the right plot +### Use the right plot - Pick the right type of graph to make your point - `pandas`, `seaborn`, `matplotlib` are your friends -# Useful plugins +## Useful plugins - You can access the extensions menu: - `Edit -> nbextensions config` - `http://localhost:XYZ/nbextensions/` -## Vim bindings +### Vim bindings - [VIM binding](https://github.com/lambdalisue/jupyter-vim-binding/wiki/Installation) will change your life -## Table of content +### Table of content - To see the entire logical flow of the notebook, when you use the headers properly -## ExecuteTime +### ExecuteTime - To see how long each cell takes to execute -## Spellchecker +### Spellchecker - To improve your English! -## AutoSaveTime +### AutoSaveTime - To save the code automatically every minute -## Notify +### Notify - Show a browser notification when kernel becomes idle -## Jupytext +### Jupytext - We use Jupytext as standard part of our development flow - See `docs/work_tools/all.jupytext.how_to_guide.md` -## Gspread +### Gspread - Allow to read g-sheets in Jupyter Notebook - First, one needs to configure Google API, just follow the instructions from diff --git a/docs/coding/all.profiling.how_to_guide.md b/docs/coding/all.profiling.how_to_guide.md index 9b25aff21c..db55cc77a7 100644 --- a/docs/coding/all.profiling.how_to_guide.md +++ b/docs/coding/all.profiling.how_to_guide.md @@ -1,5 +1,7 @@ # Profiling +## Profiling + - [Profiling end-to-end a command line](#profiling-end-to-end-a-command-line) @@ -20,7 +22,7 @@ -# Profiling end-to-end a command line +## Profiling end-to-end a command line - You can use the time-tested Linux `time` command to profile both time and memory @@ -54,7 +56,6 @@ ``` - Information about the spent time are: - ``` User time (seconds): 187.70 System time (seconds): 16.27 @@ -63,13 +64,13 @@ ``` - The relevant part is the following line representing the amount of resident - memory (which is ~13GB) ` Maximum resident set size (kbytes): 13083892 ` + memory (which is ~13GB) `Maximum resident set size (kbytes): 13083892` -# Profiling Python code from command line +## Profiling Python code from command line -## cProfile +### cProfile -### Install in a Docker container +#### Install in a Docker container - From `devops/docker_build/install_cprofile.sh` ```bash @@ -79,11 +80,11 @@ > pip install gprof2dot ``` -### How to use with workflow +#### How to use with workflow - There is a script that runs the flow `amp/dev_scripts/run_profiling.sh` -### How to use manually +#### How to use manually - You need to run the code first with profiling enabled to collect the profiling data in a binary file (often called `prof.bin`). @@ -92,6 +93,7 @@ > python -m cProfile -o prof.bin ${CMD} ``` - To profile a unit test you can run: + ```bash # Profile a unit test. 
> python -m cProfile -o profile edgar/forms8/test/test_edgar_utils.py @@ -106,9 +108,11 @@ > gprof2dot -n 10 -f pstats profile -l "*extract_tables_from_forms*" | dot -Tpng -o output.png ``` -How to read a graph: https://nesi.github.io/perf-training/python-scatter/profiling-cprofile -- gprof2dot has lots of interesting options to tweak the output, e.g., + How to read a graph: + https://nesi.github.io/perf-training/python-scatter/profiling-cprofile + +- Gprof2dot has lots of interesting options to tweak the output, e.g., ```bash > gprof2dot -h ... @@ -137,19 +141,16 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili ... ``` -### process_prof.py +#### process_prof.py - You can use the script `dev_scripts/process_prof.py` to automate some tasks: + - Top-level statistics + - Plotting the call-graph + - Custom statics - - top-level statistics +### line_profiler - - plotting the call-graph - - - custom statics - -## line_profiler - -- cProfile allows to break down the execution time into function calls, while +- CProfile allows to break down the execution time into function calls, while kernprof allows to profile a function line by line. - GitHub: [ pip install line_profiler ``` -### How to use +#### How to use - Instrument the code to profile: @@ -172,7 +173,7 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili import atexit def exit_handler(): - profiler.print_stats() + profiler.print_stats() atexit.register(exit_handler) @@ -189,7 +190,7 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili Wrote profile results to run_process_forecasts.py.lprof ``` -## pytest-profiling +### pytest-profiling - Webpage: [https://pypi.org/project/pytest-profiling](https://pypi.org/project/pytest-profiling) @@ -199,19 +200,19 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili > pip install pytest-profiling ``` -### How to use +#### How to use - ```bash - > pytest --profile ./amp/core/dataflow_model/test/test_pnl_simulator.py::TestPnlSimulator2::test_perf1 -s - ``` +```bash + > pytest --profile ./amp/core/dataflow_model/test/test_pnl_simulator.py::TestPnlSimulator2::test_perf1 -s +``` -# Profiling in a Jupyter notebook +## Profiling in a Jupyter notebook - You can find all of the examples below in action in the `amp/core/notebooks/time_memory_profiling_example.ipynb` [link](https://github.com/kaizen-ai/kaizenflow/blob/master/core/notebooks/time_memory_profiling_example.ipynb). -## Time profilers +### Time profilers - In a notebook, execute cell with `%time` cell-magic: ```python @@ -219,12 +220,13 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili func() ``` -## By function +### By function - We prefer cProfile for profiling and gprof2dot for visualization. - The documentation does not state this, but `%prun` magic uses cProfile under the hood, so we can use it in the notebook instead + ```python # We can suppress output to the notebook by specifying "-q". %%prun -D tmp.pstats func() !gprof2dot -f pstats tmp.pstats | dot -Tpng -o output.png @@ -237,7 +239,7 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili - If you open the output image in the new tab, you can zoom in and look at the graph in detail. 
-- gprof2dot supports thresholds that make output more readable: +- Gprof2dot supports thresholds that make output more readable: ```python !gprof2dot -n 5 -e 5 -f pstats tmp.pstats | dot -Tpng -o output.png @@ -247,7 +249,7 @@ How to read a graph: https://nesi.github.io/perf-training/python-scatter/profili - This will filter the output into something like this: -# Memory profilers +## Memory profilers - We prefer using [memory-profiler](https://github.com/pythonprofilers/memory_profiler). diff --git a/docs/coding/all.publish_notebook.how_to_guide.md b/docs/coding/all.publish_notebook.how_to_guide.md index ae9c4fb882..50a1d63f72 100644 --- a/docs/coding/all.publish_notebook.how_to_guide.md +++ b/docs/coding/all.publish_notebook.how_to_guide.md @@ -1,4 +1,4 @@ - +# Publishing a notebook diff --git a/docs/coding/all.reading_other_people_code.how_to_guide.md b/docs/coding/all.reading_other_people_code.how_to_guide.md index 8d877caf56..a335e38856 100644 --- a/docs/coding/all.reading_other_people_code.how_to_guide.md +++ b/docs/coding/all.reading_other_people_code.how_to_guide.md @@ -1,4 +1,4 @@ - +# Reading Other People Code @@ -17,7 +17,7 @@ -# Reading other people code +## Reading other people code - People don't like reading other people's code - Still reading existing code needs to be done @@ -27,9 +27,9 @@ actually learn and improve as a coder - E.g., writers read and study other writers' book to improve -# What not to do +## What not to do -## Rewrite coding +### Rewrite coding - You think "This code is a complete ugly mess. It needs to be rewritten" - The answer is: ABSOLUTELY NO! @@ -42,7 +42,7 @@ - In other terms, there is no reason to believe that you are going to do a better job than others did -## Incremental renovation +### Incremental renovation - The first thing that programmers want to do is to bulldoze the place flat and build something great @@ -54,7 +54,7 @@ - Adding unit tests - In reality, 99.9% of work is incremental -## It's harder to read code than to write it +### It's harder to read code than to write it - For this reason code reuse is hard - For this reason, everybody on the team has the same function to do the same @@ -62,7 +62,7 @@ - It's easier and more fun to write new code than figuring out how the old code works -## Respect old code! +### Respect old code! - When you think "the old code is a mess", you are probably wrong @@ -78,7 +78,7 @@ - When you throw away code and start from scratch, you are throwing away all the knowledge, all the bug fixes, all the hard thinking -## What makes code a mess? +### What makes code a mess? - What makes the code a "mess" (at least according to your expert opinion as world-class coder): @@ -99,9 +99,9 @@ - All these problems can be easily fixed in 100x less time than rewriting -# What to do +## What to do -## Get into the right attitude +### Get into the right attitude 1. 
Assume that whoever wrote the code knew what he/she was doing - If that's not the case, he/she would have already been fired from the team @@ -115,7 +115,7 @@ - There is no reason to believe that you can write the code in a simpler way - The complexity is almost always needed to solve the complex problem we have -## Reading other people code is painful +### Reading other people code is painful - The problem is that code reflects the thought process of the person who wrote the code @@ -134,7 +134,7 @@ - Maybe a hack solution needed to be added to ship and get the \$1m from the customers -## Suggestions on how to read code +### Suggestions on how to read code - Use `git blame` to understand who wrote the code and over what period of time - Knowing the author can help you ask him/her questions directly @@ -198,7 +198,7 @@ - The more code you read, the more comfortable you will become -# Refs +## Refs - [How to Read Code (Eight Things to Remember)](https://spin.atomicobject.com/2017/06/01/how-to-read-code`) - [Things you should never do](https://www.joelonsoftware.com/2000/04/06/things-you-should-never-do-part-i) diff --git a/docs/coding/all.run_jupyter_notebook.how_to_guide.md b/docs/coding/all.run_jupyter_notebook.how_to_guide.md index da82462dbb..9aa076cf49 100644 --- a/docs/coding/all.run_jupyter_notebook.how_to_guide.md +++ b/docs/coding/all.run_jupyter_notebook.how_to_guide.md @@ -1,4 +1,4 @@ - +# Run Jupyter notebook diff --git a/docs/coding/all.run_unit_tests.how_to_guide.md b/docs/coding/all.run_unit_tests.how_to_guide.md index c1a0c16d32..6af855a517 100644 --- a/docs/coding/all.run_unit_tests.how_to_guide.md +++ b/docs/coding/all.run_unit_tests.how_to_guide.md @@ -1,4 +1,4 @@ - +# Run Unit Tests @@ -32,13 +32,13 @@ -# Run unit tests +## Run unit tests - We use `pytest` and `unittest` as testing framework - Before any PR (and ideally after a few commits), we want to run all the unit tests to make sure we didn't introduce any new bugs -## Test lists +### Test lists - We have different test set lists: - `fast` @@ -57,7 +57,7 @@ - No time limit but we need to be judicious with length - Anything above 5-15 mins is problematic -## Using `invoke` +### Using `invoke` - [`invoke`](https://www.pyinvoke.org/) is a task execution framework which allows to execute some typical workflows in a simple way @@ -98,7 +98,7 @@ -v STRING, --version=STRING ``` -### Docker image stage and version +#### Docker image stage and version - To select a specific stage for Docker image use the `--stage` option. E.g., this might be useful when a user wants to run regressions on the local Docker @@ -115,12 +115,12 @@ > i run_fast_tests --stage local --version 1.0.4 ``` -### Specifying `pytest` options +#### Specifying `pytest` options - With the option `--pytest-opts` it is possible to pass any `pytest` option to `invoke`. -### Running in debug mode +#### Running in debug mode - If a user wants to run the tests in debug mode to show the output ```bash @@ -129,7 +129,7 @@ - This is equivalent to specifying `-v DEBUG` through the command line of one of the executables -### Save test output to a file +#### Save test output to a file - To save the output of `pytest` to `tmp.pytest.log` use the `--tee-to-file` option. @@ -137,14 +137,14 @@ > i run_fast_tests --tee-to-file ``` -### Show the tests but do not run +#### Show the tests but do not run - To list, but not run, the tests that will be executed, use `--collect-only`. 
```bash > i run_fast_test --collect-only ``` -### Skip submodules +#### Skip submodules - To skip running tests in submodules, use the `--skip-submodules` option. - This option is useful in repos with Git submodules so that you can run only @@ -156,7 +156,7 @@ > i run_fast_tests --skip-submodules ``` -### Compute test coverage +#### Compute test coverage - To compute test coverage use the `--coverage` option @@ -164,7 +164,7 @@ > i run_fast_tests --coverage ``` -## Timeout +### Timeout - We use the [`pytest-timeout`](https://pypi.org/project/pytest-timeout/) package to limit durations of fast, slow, and superslow tests @@ -174,7 +174,7 @@ `set_up_test()` and `tear_down_test()` time, if they are run at the beginning/end of the methods -## Rerunning timeout-ed tests +### Rerunning timeout-ed tests - Running tests can take different amounts of time depending on workload and machine @@ -193,7 +193,7 @@ now due to [#693 (comment)](https://github.com/cryptokaizen/cmamp/issues/693#issuecomment-989456031) -## Compute test coverage +### Compute test coverage - The documentation for `coverage` is [here](https://coverage.readthedocs.io/en/latest/cmd.html#reporting). @@ -374,7 +374,7 @@ ![alt_text](figs/unit_tests/image_2.png) -### An example coverage session +#### An example coverage session - We want to measure the unit test coverage of `oms` component from both fast and slow tests @@ -478,7 +478,7 @@ Combined data file .coverage_slow_tests ``` -### An example with customized `pytest-cov` html run +#### An example with customized `pytest-cov` html run - We want to measure unit test coverage specifically for one test in `im_v2/common/data/transform/` and to save generated `htmlcov` in the same @@ -508,7 +508,7 @@ Coverage HTML written to dir im_v2/common/data/transform/htmlcov ``` -### Generate coverage report with `invoke` +#### Generate coverage report with `invoke` - One can compute test coverage for a specified directory and generate text and HTML reports automatically using `invoke task run_coverage_report` @@ -535,7 +535,7 @@ -t STRING, --target-dir=STRING ``` -#### Common usage +##### Common usage - Compute coverage for `market_data` dir, generate text and HTML reports and publish HTML report on S3 @@ -558,7 +558,7 @@ 20:08:53 - INFO lib_tasks.py _publish_html_coverage_report_on_s3:3679 HTML coverage report is published on S3: path=`s3://cryptokaizen-html/html_coverage/grisha_CmTask1038_Tool_to_extract_the_dependency_from_a_project` ``` -### Publishing HTML report on S3 +#### Publishing HTML report on S3 - To make a dir with the report unique, we decorate the dir with a linux user and a Git branch name, e.g., @@ -570,9 +570,9 @@ - E.g. 
[http://172.30.2.44/html_coverage/grisha_CmTask1038_Tool_to_extract_the_dependency_from_a_project/](http://172.30.2.44/html_coverage/grisha_CmTask1038_Tool_to_extract_the_dependency_from_a_project/) -# Running `pytest` directly +## Running `pytest` directly -## Usage and Invocations reference +### Usage and Invocations reference - See [`pytest` documentation](http://doc.pytest.org/en/latest/usage.html) - Some examples of useful command lines: @@ -604,9 +604,9 @@ > pytest --last-failed ``` -## Custom `pytest` options behaviors +### Custom `pytest` options behaviors -### Enable logging +#### Enable logging - To enable logging of `_LOG.debug` for a single test run: @@ -615,7 +615,7 @@ > pytest oms/test/test_broker.py::TestSimulatedBroker1 -s --dbg ``` -### Update golden outcomes +#### Update golden outcomes - This switch allows to overwrite the golden outcomes that are used as reference in the unit tests to detect failures @@ -624,7 +624,7 @@ > pytest --update_outcomes ``` -### Incremental test mode +#### Incremental test mode - This switch allows to reuse artifacts in the test directory and to skip the clean up phase @@ -636,7 +636,7 @@ > pytest --incremental ``` -## Debugging Notebooks +### Debugging Notebooks 1. Run a failing test with `-s --dbg` to get detailed logs - E.g., `> pytest core/plotting/test/test_gallery_notebook.py -s --dbg` @@ -653,12 +653,12 @@ - E.g., -# Running tests on GH Actions +## Running tests on GH Actions - The official documentation is [https://docs.github.com/en/actions](https://docs.github.com/en/actions) -## How to run a single test on GH Action +### How to run a single test on GH Action - Unfortunately, there is no way to log in and run interactively on GH machines. This is a feature requested but not implemented by GH yet. diff --git a/docs/coding/all.submit_code_for_review.how_to_guide.md b/docs/coding/all.submit_code_for_review.how_to_guide.md index 3549441ce7..af216de4cd 100644 --- a/docs/coding/all.submit_code_for_review.how_to_guide.md +++ b/docs/coding/all.submit_code_for_review.how_to_guide.md @@ -205,7 +205,7 @@ some helpful tips and resources to guide you through your first review. - What the problem is - Why the outcome is different from what you expected - E.g. on how to report any issues - - https://github.com/kaizen-ai/kaizenflow/issues/370#issue-1782574355 + - Https://github.com/kaizen-ai/kaizenflow/issues/370#issue-1782574355 ## Talk through code and not GitHub @@ -223,8 +223,8 @@ some helpful tips and resources to guide you through your first review. idea of what common issues are and how to address them. 
- Here are some links to a few "painful" first reviews: - Adding unit tests: - - https://github.com/kaizen-ai/kaizenflow/pull/166 - - https://github.com/kaizen-ai/kaizenflow/pull/186 + - Https://github.com/kaizen-ai/kaizenflow/pull/166 + - Https://github.com/kaizen-ai/kaizenflow/pull/186 - Writing scripts: - - https://github.com/kaizen-ai/kaizenflow/pull/267 - - https://github.com/kaizen-ai/kaizenflow/pull/276 + - Https://github.com/kaizen-ai/kaizenflow/pull/267 + - Https://github.com/kaizen-ai/kaizenflow/pull/276 diff --git a/docs/coding/all.type_hints.how_to_guide.md b/docs/coding/all.type_hints.how_to_guide.md index 3555a5c34a..2af0410d42 100644 --- a/docs/coding/all.type_hints.how_to_guide.md +++ b/docs/coding/all.type_hints.how_to_guide.md @@ -1,33 +1,35 @@ -# Type hints +# Type Hints + +## Type hints - - [Why we use type hints](#why-we-use-type-hints) - - [What to annotate with type hints](#what-to-annotate-with-type-hints) - - [Conventions](#conventions) - * [Empty return](#empty-return) - * [Invoke tasks](#invoke-tasks) - * [Annotation for kwargs](#annotation-for-kwargs) - * [Any](#any) - * [np.array and np.ndarray](#nparray-and-npndarray) - - [Handling the annoying Incompatible types in assignment](#handling-the-annoying-incompatible-types-in-assignment) - - [Handling the annoying "None" has no attribute](#handling-the-annoying-none-has-no-attribute) - - [Disabling mypy errors](#disabling-mypy-errors) - - [What to do when you don't know what to do](#what-to-do-when-you-dont-know-what-to-do) - - [Library without types](#library-without-types) - - [Inferring types using unit tests](#inferring-types-using-unit-tests) +- [Why we use type hints](#why-we-use-type-hints) +- [What to annotate with type hints](#what-to-annotate-with-type-hints) +- [Conventions](#conventions) + * [Empty return](#empty-return) + * [Invoke tasks](#invoke-tasks) + * [Annotation for `kwargs`](#annotation-for-kwargs) + * [`Any`](#any) + * [`np.array` and `np.ndarray`](#nparray-and-npndarray) +- [Handling the annoying `Incompatible types in assignment`](#handling-the-annoying-incompatible-types-in-assignment) +- [Handling the annoying `"None" has no attribute`](#handling-the-annoying-none-has-no-attribute) +- [Disabling `mypy` errors](#disabling-mypy-errors) +- [What to do when you don't know what to do](#what-to-do-when-you-dont-know-what-to-do) +- [Library without types](#library-without-types) +- [Inferring types using unit tests](#inferring-types-using-unit-tests) -# Why we use type hints +## Why we use type hints - We use Python 3 type hints to: - - Improve documentation - Allow mypy to perform static checking of the code, looking for bugs - - Enforce the type checks at run-time, through automatic assertions (not implemented yet) + - Enforce the type checks at run-time, through automatic assertions (not + implemented yet) -# What to annotate with type hints +## What to annotate with type hints - We expect all new library code (i.e., that is not in a notebook) to have type annotations @@ -36,37 +38,36 @@ can't infer the type - We strive to get no errors / warnings from the linter, including mypy -# Conventions +## Conventions -## Empty return +### Empty return - Return `-> None` if your function doesn't return - - Pros: - - `mypy` checks functions only when there is at least an annotation: so using - `-> None` enables mypy to do type checking + - `mypy` checks functions only when there is at least an annotation: so + using `-> None` enables mypy to do type checking - It reminds us that we need to use type 
hints - Cons: - `None` is the default value and so it might seem redundant -## Invoke tasks +### Invoke tasks - For some reason `invoke` does not like type hints, so we - - - Omit type hints for `invoke` tasks, i.e. functions with the `@task` decorator + - Omit type hints for `invoke` tasks, i.e. functions with the `@task` + decorator - Put `# type: ignore` so that `mypy` does not complain - Example: ```python - @task - def run_qa_tests( # type: ignore - ctx, - stage="dev", - version="", - ): + @task + def run_qa_tests( # type: ignore + ctx, + stage="dev", + version="", + ): ``` -## Annotation for `kwargs` +### Annotation for `kwargs` - We use `kwargs: Any` and not `kwargs: Dict[str, Any]` - `*` always binds to a `Tuple`, and `**` always binds to a `Dict[str, Any]`. @@ -75,24 +76,24 @@ `Tuple[_, ...]` and `Dict[str, _]` container types. - [Reference article](https://adamj.eu/tech/2021/05/11/python-type-hints-args-and-kwargs/) -## `Any` +### `Any` - `Any` type hint = no type hint - We try to avoid it everywhere when possible -## `np.array` and `np.ndarray` +### `np.array` and `np.ndarray` - If you get something like the following lint: ```bash - dataflow/core/nodes/sklearn_models.py:537:[amp_mypy] error: Function "numpy.core.multiarray.array" is not valid as a type [valid-type] + dataflow/core/nodes/sklearn_models.py:537:[amp_mypy] error: Function "numpy.core.multiarray.array" is not valid as a type [valid-type] ``` -- Then the problem is probably that a parameter that the lint is related to - has been typed as `np.array` while it should be typed as `np.ndarray`: +- Then the problem is probably that a parameter that the lint is related to has + been typed as `np.array` while it should be typed as `np.ndarray`: ```python `x_vals: np.array` -> `x_vals: np.ndarray` ``` -# Handling the annoying `Incompatible types in assignment` +## Handling the annoying `Incompatible types in assignment` - `mypy` assigns a single type to each variable for its entire scope - The problem is in common idioms where we use the same variable to store @@ -135,7 +136,7 @@ test_func(arg=cast(bool, var)) ``` -# Handling the annoying `"None" has no attribute` +## Handling the annoying `"None" has no attribute` - In some model classes `self._model` parameter is being assigned to `None` in ctor and being set after calling `set_fit_state` method @@ -151,16 +152,16 @@ ``` - A solution is to - Type hint when assigning the model parameter in ctor: - ```python - self._model: Optional[sklearn.base.BaseEstimator] = None - ``` + ```python + self._model: Optional[sklearn.base.BaseEstimator] = None + ``` - Cast a type to the model parameter after asserting that it is not `None`: - ```python - hdbg.dassert_is_not(self._model, None) - self._model = cast(sklearn.base.BaseEstimator, self._model) - ``` + ```python + hdbg.dassert_is_not(self._model, None) + self._model = cast(sklearn.base.BaseEstimator, self._model) + ``` -# Disabling `mypy` errors +## Disabling `mypy` errors - If `mypy` reports an error and you don't understand why, please ping one of the python experts asking for help @@ -177,17 +178,19 @@ from pyannotate_runtime import collect_types # type: ignore ``` -# What to do when you don't know what to do +## What to do when you don't know what to do -- Go to the [`mypy` official cheat sheet](https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html) +- Go to the + [`mypy` official cheat sheet](https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html) - Use `reveal_type` - To find out what type `mypy` infers for an expression 
anywhere in your program, wrap it in `reveal_type()` - `mypy` will print an error message with the type; remove it again before running the code - - See [the official `mypy` documentation](https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html#when-you-re-puzzled-or-when-things-are-complicated) + - See + [the official `mypy` documentation](https://mypy.readthedocs.io/en/stable/cheat_sheet_py3.html#when-you-re-puzzled-or-when-things-are-complicated) -# Library without types +## Library without types - `mypy` is unhappy when a library doesn't have types - Lots of libraries are starting to add type hints now that python 2 has been @@ -205,14 +208,13 @@ > cp mypy.ini amp/mypy.ini ``` -# Inferring types using unit tests +## Inferring types using unit tests - Sometimes it is possible to infer types directly from unit tests. We have used -this flow to annotate the code when we switched to Python3 and it worked fine -although there were various mistakes. We still prefer to annotate by hand based -on what the code is intended to do, rather than automatically infer it from how -the code behaves. - + this flow to annotate the code when we switched to Python3 and it worked fine + although there were various mistakes. We still prefer to annotate by hand + based on what the code is intended to do, rather than automatically infer it + from how the code behaves. - Install `pyannotate` ```bash > pip install pyannotate diff --git a/docs/coding/all.write_unit_tests.how_to_guide.md b/docs/coding/all.write_unit_tests.how_to_guide.md index e54a66b780..37edd22974 100644 --- a/docs/coding/all.write_unit_tests.how_to_guide.md +++ b/docs/coding/all.write_unit_tests.how_to_guide.md @@ -1,4 +1,4 @@ - +# Write Unit Tests @@ -57,9 +57,9 @@ -# Guidelines about writing unit tests +## Guidelines about writing unit tests -## What is a unit test? +### What is a unit test? - A unit test is a small, self-contained test of a (public) function or method of a library @@ -68,7 +68,7 @@ - Running the test ensures that the actual output agrees with the expected output -## Why is unit testing important? +### Why is unit testing important? - Good unit testing improves software quality by: - Eliminating bugs (obvious) @@ -76,7 +76,7 @@ - Making refactoring safer and easier ("Refactor Early, Refactor Often") - Documenting expected behavior and usage -## The Pragmatic Programming and unit testing +### The Pragmatic Programming and unit testing - Unit testing is an integral part of [The Pragmatic Programming](https://pragprog.com/titles/tpp20/the-pragmatic-programmer-20th-anniversary-edition/) @@ -94,46 +94,36 @@ - Test Early. Test Often. Test Automatically. - Use Saboteurs to Test Your Testing - Find Bugs Once -- Good unit testing improves software quality. It does this in part by - - Eliminating bugs (obvious) - - Clarifying code design and interfaces ("Design to Test") - - Making refactoring safer and easier ("Refactor Early, Refactor Often") - - Documenting expected behavior and usage -## Unit testing tips +### Unit testing tips -### Test one thing +#### Test one thing - A good unit test tests only one thing - Testing one thing keeps the unit test simple, relatively easy to understand, and helps isolate the root cause when the test fails - How do you test more than one thing? By having more than one unit test! 
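- As a minimal sketch of this guideline (the `add()` function and the test
  class are hypothetical; in the codebase tests typically derive from
  `hunitest.TestCase`, but plain `unittest` shows the same idea), "testing more
  than one thing" maps to more than one test method, each exercising a single
  behavior:

  ```python
  import unittest


  def add(x: int, y: int) -> int:
      return x + y


  class TestAdd1(unittest.TestCase):
      def test_positive_values(self) -> None:
          # Test only the behavior for positive inputs.
          self.assertEqual(add(1, 2), 3)

      def test_negative_values(self) -> None:
          # Test only the behavior for negative inputs, in a separate test.
          self.assertEqual(add(-1, -2), -3)
  ```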
-### Keep tests self-contained +#### Keep tests self-contained - A unit test should be independent of all other unit tests - Each test should be self-sufficient - One should never assume that unit tests will be executed in a particular order - A corollary of keeping tests self-contained is to keep all information needed to understand the test within the test itself -- In other words, when possible, avoid calling helper functions to load data or - state to initialize the test; instead, specify the data explicitly in the test - where it is used + - Specify the data explicitly in the test where it is used - This makes the test easier to understand and easier to debug when it fails - If multiple unit tests use or can use the same initialization data, do not hesitate repeating it in each test (or consider using parameterized testing) -### Only specify data related to what is being tested +#### Only specify data related to what is being tested -- If a function that is being tested supports optional arguments, but those - optional arguments are not needed for a particular unit test, then do not - specify them in the test - Specify the minimum of what is required to test what is being tested - E.g., if a function that is being tested supports optional arguments, but those optional arguments are not needed for a particular unit test, then do not specify them in the test -### Test realistic corner cases +#### Test realistic corner cases - Can the function receive an empty list? - Can it return an empty Series? @@ -143,13 +133,13 @@ - Expect these questions to come up in practice and think through what the appropriate behavior should be. Then, test for it. -### Test a typical scenario +#### Test a typical scenario - In ensuring that corner cases are covered, do not overlook testing basic functionality for typical cases - This is useful for verifying current behavior and to support refactoring. 
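- For instance, a hedged sketch (the `compute_mean()` helper is hypothetical)
  covering one typical scenario and one realistic corner case, with the input
  data specified inline in each test:

  ```python
  import math
  import unittest
  from typing import List


  def compute_mean(values: List[float]) -> float:
      # Return NaN for empty input instead of raising.
      if not values:
          return float("nan")
      return sum(values) / len(values)


  class TestComputeMean1(unittest.TestCase):
      def test_typical_input(self) -> None:
          # Typical scenario: a small, explicit input defined inside the test.
          self.assertEqual(compute_mean([1.0, 2.0, 3.0]), 2.0)

      def test_empty_input(self) -> None:
          # Realistic corner case: an empty list returns NaN.
          self.assertTrue(math.isnan(compute_mean([])))
  ```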
-### Test executable scripts end-to-end +#### Test executable scripts end-to-end - In some cases, like scripts, it is easy to get lost chasing the coverage % - E.g., covering every line of the original, including the parser @@ -163,9 +153,9 @@ - A good practice is to have a `_run()` function that does all the job and `_main()` only brings together the parser and the executable part -## Conventions +### Conventions -### Naming and placement conventions +#### Naming and placement conventions - We follow conventions that happen to be mostly the default to `pytest` @@ -214,13 +204,13 @@ - Split test classes and methods in a reasonable way so each one tests one single thing in the simplest possible way -### Keep testing code in sync with tested code +#### Keep testing code in sync with tested code - If you change the name of a tested class, also the test should be changed - If you change the name of a file also the name of the file with the testing code should be changed -### Test code is not second-class citizen +#### Test code is not second-class citizen - Test code is not second-class citizen, even though it's auxiliary to the code @@ -229,7 +219,7 @@ - Avoid repetition in test code, but use helper to factor out common code - Abhor copy-paste and keep the code DRY -### Testing code layout +#### Testing code layout - The layout of a test dir should look like: ```bash @@ -246,7 +236,7 @@ test_system_interaction.py ``` -### Our framework to test using input / output data +#### Our framework to test using input / output data - `helpers/unit_test.py` has some utilities to create input and output easily dirs storing data for unit tests @@ -266,22 +256,7 @@ └── output ``` -- The layout of test dir: - ```bash - > ls -1 helpers/test/ - Test_dassert1.test2 - Test_dassert1.test3 - Test_dassert1.test4 - ... 
- Test_dassert_misc1.test6 - Test_dassert_misc1.test8 - Test_system1.test7 - test_dbg.py - test_helpers.py - test_system_interaction.py - ``` - -### Use text and not pickle files as input/outputs +#### Use text and not pickle files as input/outputs - The problems with pickle files are the usual ones - Pickle files are not stable across different versions of libraries @@ -294,7 +269,7 @@ - Document how it was generated - Even better, add a test that generates the data -### Small testing data is best +#### Small testing data is best - Use a subset of the input data @@ -307,16 +282,16 @@ Last review: GP on 2024-05-13 -### `check_string` vs `self.assertEqual` +#### `check_string` vs `self.assertEqual` - TODO(gp): Add -### Use `self.assert_equal()` +#### Use `self.assert_equal()` - This is a function that helps you understand what the mismatches are - It works on `str` -### How to split unit test code in files +#### How to split unit test code in files - The two extreme approaches are: - All the test code for a directory goes in one file @@ -348,7 +323,7 @@ Last review: GP on 2024-05-13 - So it's easy to find which file is tested were using grep - Then split when it becomes too big using `test_$FILENAME.py` -### Skeleton for unit test +#### Skeleton for unit test - Interesting unit tests are in `helpers/test` - A unit test looks like: @@ -364,7 +339,7 @@ Last review: GP on 2024-05-13 unittest.main() ``` -### Hierarchical `TestCase` approach +#### Hierarchical `TestCase` approach - Whenever there is a hierarchy in classes, we also create a hierarchy of test classes @@ -405,20 +380,20 @@ Last review: GP on 2024-05-13 - As an example, see `im_v2/common/data/client/test/im_client_test_case.py` and `im_v2/ccxt/data/client/test/test_ccxt_clients.py` -### Use the appropriate `self.assert*` +#### Use the appropriate `self.assert*` - When you get a failure, you don't want to get something like "True is not False", rather an informative message like "5 is not < 4" - Bad `self.assertTrue(a < b)` - Good `self.assertLess(a, b)` -### Do not use `hdbg.dassert` in testing +#### Do not use `hdbg.dassert` in testing - `dassert`s are for checking the self-consistency of the code - The invariant is that you can remove `dbg.dassert` without changing the code's behavior. 
Of course, you can't remove the assertion and get unit tests to work -### Always explain `self.assertRaises` +#### Always explain `self.assertRaises` - Testing for an assertion needs to always be done with the following idiom to explain exactly what we are catching and why @@ -429,23 +404,20 @@ Last review: GP on 2024-05-13 ) act = str(cm.exception) exp = r""" - * Failed assertion * - '0' - == - '1' - Specify only one among --modified, --branch, --last-commit - """ - self.assert_equal(act, exp, fuzzy_match=True) + ``` +* Failed assertion \* '0' == '1' Specify only one among --modified, + --branch, --last-commit """ self.assert_equal(act, exp, fuzzy_match=True) + ``` ``` -### Interesting testing functions +#### Interesting testing functions - List of useful testing functions are: - [General python](https://docs.python.org/2/library/unittest.html#test-cases) - [Numpy](https://docs.scipy.org/doc/numpy-1.15.0/reference/routines.testing.html) - [Pandas](https://pandas.pydata.org/pandas-docs/version/0.21/api.html#testing-functions) -### Use set_up_test / tear_down_test +#### Use set_up_test / tear_down_test - If you have a lot of repeated code in your tests, you can make them shorter by moving this code to `set_up_test/tear_down_test` methods: @@ -513,7 +485,7 @@ Last review: GP on 2024-05-13 `super().setUp()`/`super.tearDown()`, then `setUp()`/`tearDown()` can be discarded completely. -#### Nested set_up_test / tear_down_test +##### Nested set_up_test / tear_down_test - When a test class (e.g., TestChild) inherits from another test class (e.g., TestParent), `setUp()`/`tearDown()` methods in the child class normally @@ -793,7 +765,7 @@ Last review: GP on 2024-05-13 ... ``` -### Use setUpClass / tearDownClass +#### Use setUpClass / tearDownClass - If you need some expensive code parts to be done once for the whole test class, such as opening a database connection, opening a temporary file on the @@ -827,7 +799,7 @@ Last review: GP on 2024-05-13 - For more information, see [official unittest docs](https://docs.python.org/3/library/unittest.html) -# Update test tags +## Update test tags - There are 2 files with the list of tests' tags: - `amp/pytest.ini` @@ -837,16 +809,16 @@ Last review: GP on 2024-05-13 - After a `:` add a short description - Keep tags in the alphabetical order -# Mocking +## Mocking -## Refs +### Refs - Introductory article is [https://realpython.com/python-mock-library/ ](https://realpython.com/python-mock-library/) - Official Python documentation for the mock package can be seen here [unit test mock](https://docs.python.org/3/library/unittest.mock.html) -## Common usage samples +### Common usage samples It is best to apply on any part that is deemed unnecessary for specific test @@ -864,7 +836,7 @@ It is best to apply on any part that is deemed unnecessary for specific test - Many more possible combinations can be seen in the official documentation. - Below are the most common ones for basic understanding. -## Philosophy about mocking +### Philosophy about mocking 1. 
We want to mock the minimal surface of a class - E.g., assume there is a class that is interfacing with an external provider @@ -881,9 +853,9 @@ It is best to apply on any part that is deemed unnecessary for specific test - We want to test the minimal amount of behavior that enforces what we care about -## Some general suggestions about testing +### Some general suggestions about testing -### Test from the outside-in +#### Test from the outside-in - We want to start testing from the end-to-end methods towards the constructor of an object @@ -893,7 +865,7 @@ It is best to apply on any part that is deemed unnecessary for specific test - Use the code coverage to see what's left to test once you have tested the "most external" code -### We don't need to test all the assertions +#### We don't need to test all the assertions - E.g., testing carefully that we can't pass a value to a constructor doesn't really test much besides the fact that `dassert` works (which, surprisingly @@ -901,7 +873,7 @@ It is best to apply on any part that is deemed unnecessary for specific test - We don't care about line coverage or checking boxes for the sake of checking boxes -### Use strings to compare output instead of data structures +#### Use strings to compare output instead of data structures - Often, it's easier to do a check like: @@ -926,7 +898,7 @@ It is best to apply on any part that is deemed unnecessary for specific test - In case of mismatch, it's easier to update the string with copy-paste rather than creating a data structure that matches what was created -### Use `self.check_string()` for things that we care about not changing (or are too big to have as strings in the code) +#### Use `self.check_string()` for things that we care about not changing (or are too big to have as strings in the code) - Use `self.assert_equal()` for things that should not change (e.g., 1 + 1 = 2) - When using `check_string` still try to add invariants that force the code to @@ -936,18 +908,18 @@ It is best to apply on any part that is deemed unnecessary for specific test timestamps than 0 to avoid the situation where we update the string to something malformed -### Each test method should test a single test case +#### Each test method should test a single test case - Rationale: we want each test to be clear, simple, fast - If there is repeated code we should factor it out (e.g., builders for objects) -### Each test should be crystal clear on how it is different from the others +#### Each test should be crystal clear on how it is different from the others - Often, you can factor out all the common logic into a helper method - Copy-paste is not allowed in unit tests in the same way it's not allowed in production code -### In general, you want to budget the time to write unit tests +#### In general, you want to budget the time to write unit tests - E.g., "I'm going to spend 3 hours writing unit tests". 
This is going to help you focus on what's important to test and force you to use an iterative @@ -955,11 +927,11 @@ It is best to apply on any part that is deemed unnecessary for specific test ![alt_image](figs/unit_tests/image_4.png) -### Write a skeleton of unit tests and ask for a review if you are not sure how what to test +#### Write a skeleton of unit tests and ask for a review if you are not sure how what to test - Aka "testing plan" -## Object patch with return value +### Object patch with return value ```python import unittest.mock as umock @@ -982,7 +954,7 @@ def test_function_call1(self, mock_get_secret: umock.MagicMock): before mocks for test are applied - On every call, it returns string "dummy" -## Path patch with multiple return values +### Path patch with multiple return values ```python import unittest.mock as umock @@ -996,7 +968,7 @@ mock_get_secret.side_effect = ["dummy", Exception] - On first call, string `dummy` is returned - On second, `Exception` is raised -## Ways of calling `patch` and `patch.object` +### Ways of calling `patch` and `patch.object` - Via decorator ```python @@ -1024,7 +996,7 @@ mock_get_secret.side_effect = ["dummy", Exception] decorator and we do not need to worry about reverting the patch changes as that is automatically done at the end of with statement -## Mock object state after test run +### Mock object state after test run ```python @umock.patch.object(exchange_class._exchange, "fetch_ohlcv") @@ -1063,7 +1035,7 @@ def test_function_call1(self, fetch_ohlcv_mock: umock.MagicMock): function regardless of how many times it is called - Useful for verifying that args passed are changing as expected -## Mock common external calls in `hunitest.TestCase` class +### Mock common external calls in `hunitest.TestCase` class ```python class TestCcxtExtractor1(hunitest.TestCase): @@ -1102,15 +1074,15 @@ class TestCcxtExtractor1(hunitest.TestCase): separately. We want to avoid that and only start/stop same patch for each test. -## Mocks with specs +### Mocks with specs ```python -# Regular mock and external library `ccxt` is replaced with `MagicMock` +## Regular mock and external library `ccxt` is replaced with `MagicMock` @umock.patch.object(ivcdexex, "ccxt") -# Only `ccxt` is spec'd, not actual components that are "deeper" in the `ccxt` library. +## Only `ccxt` is spec'd, not actual components that are "deeper" in the `ccxt` library. @umock.patch.object(ivcdexex, "ccxt", spec=ivcdexex.ccxt) -# Everything is spec'd recursively , including returning values/instances of `ccxt` -# functions and returned values/instances of returned values/instances, etc. +## Everything is spec'd recursively , including returning values/instances of `ccxt` +## functions and returned values/instances of returned values/instances, etc. @umock.patch.object(ivcdexex, "ccxt", autospec=True) ``` @@ -1127,11 +1099,11 @@ class TestCcxtExtractor1(hunitest.TestCase): - As newly `exchange` instance is with spec, we can only call real functions/attributes of `ccxt.Exchange` class -## Caveats +### Caveats ```python -# `datetime.now` cannot be patched directly, as it is a built-in method. -# Error: "can't set attributes of built-in/extension type 'datetime.datetime'" +## `datetime.now` cannot be patched directly, as it is a built-in method. 
+## Error: "can't set attributes of built-in/extension type 'datetime.datetime'" datetime_patch = umock.patch.object(imvcdeexut, "datetime", spec=imvcdeexut.datetime) ``` diff --git a/docs/dataflow/all.batch_and_streaming_mode_using_tiling.explanation.md b/docs/dataflow/all.batch_and_streaming_mode_using_tiling.explanation.md index 624c2af3c1..25bae5a5e7 100644 --- a/docs/dataflow/all.batch_and_streaming_mode_using_tiling.explanation.md +++ b/docs/dataflow/all.batch_and_streaming_mode_using_tiling.explanation.md @@ -1,4 +1,4 @@ - +# Batch And Streaming Mode Using Tiling @@ -12,7 +12,7 @@ -# The property of tilability +## The property of tilability The working principle of a DataFlow computation is that nodes should be able to compute their outputs from their inputs without a dependency on how the inputs @@ -61,7 +61,7 @@ exponentially weighted moving average for adjacent intervals of times), then it can be made tileable by adding auxiliary state to store the partial amount of computatin. -## Temporal tiling +### Temporal tiling In most computations there is a special axis that represents time and moves only from past to future. The data along other axes represent (potentially @@ -162,7 +162,7 @@ described above is general, within any desired level of approximation. The amount of history is function of a node -## Cross-sectional tiling +### Cross-sectional tiling - The same principle can be applied to tiling computation cross-sectionally @@ -170,20 +170,20 @@ The amount of history is function of a node correct - TODO(gp): Make an example -## Temporal and cross-sectional tiling +### Temporal and cross-sectional tiling - These two styles of tiling can be composed - The tiling doesn't even have to be regular, as long as the constraints for a correct computation are correct -## Detecting incorrect tiled computations +### Detecting incorrect tiled computations - One can use the tiling invariance of a computation to verify that it is correct - E.g., if computing a DAG gives different results for different tiled, then the amount of history to each node is not correct -## Benefits of tiled computation +### Benefits of tiled computation - Another benefit of tiled computation is that future peeking (i.e., a fault in a computation that requires data not yet available at the computation time) @@ -199,7 +199,7 @@ transformation without altering the computation, e.g., - Parallelization of tiles and nodes across different CPUs - Select the size of a tile so that the computation fits in memory -# Batch vs streaming +## Batch vs streaming Once a computation can be tiled, the same computation can be performed in batch mode (e.g., the entire data set is processed at once) or in streaming mode diff --git a/docs/dataflow/all.best_practice_for_building_dags.explanation.md b/docs/dataflow/all.best_practice_for_building_dags.explanation.md index e30f0c402a..db8c44de41 100644 --- a/docs/dataflow/all.best_practice_for_building_dags.explanation.md +++ b/docs/dataflow/all.best_practice_for_building_dags.explanation.md @@ -1,4 +1,4 @@ - +# Best Practice For Building Dags @@ -25,9 +25,9 @@ -# Config +## Config -## Config builders +### Config builders `Config`s can be built through functions that can complete a "template" config with some parameters passed from the user @@ -47,9 +47,9 @@ Config builders can be nested. 
You can use put `nid_prefix` in the `DagBuilder` constructor, since `nid_prefix` acts as a namespace to avoid `nid` collisions -# DAG builders +## DAG builders -## DAG builder methods +### DAG builder methods - DAG builders accept a Config and return a DAG - E.g., @@ -80,7 +80,7 @@ DAG builders give meaningful `nid` names to their nodes. Collisions in graphs built from multiple builders are avoided by the user through the judicious use of namespace-like nid prefixes. -## DAG and Nodes +### DAG and Nodes The DAG structure does not know about what data is exchanged between nodes. @@ -97,7 +97,7 @@ The DAG `node`s are wrappers around Pandas dataframes instruments and multiple features per instruments), the node should, assuming a well-defined DAG and config, know how to melt and pivot columns -## Keeping `config` and `DagBuilder` in sync +### Keeping `config` and `DagBuilder` in sync - `Config` asserts if a `DagBuilder` tries to access a hierarchical parameter that doesn't exist and reports a meaningful error of what the problem is @@ -106,19 +106,19 @@ The DAG `node`s are wrappers around Pandas dataframes reports a warning for all the parameters that were not used - This is mostly for a sanity check and debugging, so we don't assert -## `DagBuilder` idiom +### `DagBuilder` idiom When we build DAGs we use `DagBuilder` that call a constructor from `get_dag()` with params from the `get_config()` ``` dag_builder = DagBuilder() template_config = dag_builder.get_template_config() -# Complete the config. +## Complete the config. config = template_config[...] dag = dag_builder.get_dag(config) ``` -## Invariants +### Invariants Nodes of the DAG propagate dataframes @@ -136,7 +136,7 @@ We assume that dataframes are aligned in terms of timescale - When data sources have different time resolutions, typically we perform outer merges either leaving nans or filling with forward fills -## Make code easy to wrap code into `Nodes` +### Make code easy to wrap code into `Nodes` We strive to write functions (e.g., from `signal_processing.py`) that: @@ -151,7 +151,7 @@ We strive to write functions (e.g., from `signal_processing.py`) that: - E.g., refer to `process_outliers()` as an example -## `ColumnTransformer` +### `ColumnTransformer` `ColumnTransformer` is a very flexible `Node` class that can wrap a wide variety of functions @@ -167,7 +167,7 @@ of functions - `DataframeMethodRunner` can run any `pd.DataFrame` method supported and forwards kwargs -## One vs multiple graphs +### One vs multiple graphs - We still don't have a final answer about this design issue - Pros of one graph: @@ -180,7 +180,7 @@ of functions - One connected component (instead of a number depending upon the number of tickers) -## How to handle multiple features for a single instrument +### How to handle multiple features for a single instrument - E.g., `close` and `volume` for a single futures instrument - In this case we can use a dataframe with two columns `close_price` and @@ -191,20 +191,20 @@ of functions - If close_price and volume are "independent", they should go in different branches of the graph using a "Y" split -## How to handle multiple instruments? +### How to handle multiple instruments? 
- E.g., `close` price for multiple futures - We pass a dataframe with one column per instrument - All the transformations are then performed on a column-basis - We assume that the timeseries are aligned explicitly -## How to handle multiple features with multiple instruments +### How to handle multiple features with multiple instruments - E.g., close price, high price, volume for multiple energy futures instrument - In this case we can use a dataframe with hierarchical columns, where the first dimension is the instrument, and the second dimension is the feature -## Irregular DAGs +### Irregular DAGs - E.g., if we have 10 instruments that need to use different models, we could build a DAG, instantiating 10 different pipelines @@ -217,7 +217,7 @@ of functions - E.g,. if the computation is the same up to until a point, vectorize the common part, and then split the dataframe and use different pipelines -## Namespace vs hierarchical config +### Namespace vs hierarchical config - We recognize that sometimes we might want to call the same `DagBuilder` function multiple times (e.g., a DAG that is built with a loop) @@ -225,7 +225,7 @@ of functions node with a tag to make them unique or use hierarchical DAG - It seems simpler to use prefix for the tags, which is supported -## How to know what is configurable +### How to know what is configurable - By design, DataFlow can loosely wrap Python functions @@ -252,7 +252,7 @@ of functions config to configurable functions. This ability is more important than making it easy to expose all possible configuration parameters. -## DAG extension vs copying +### DAG extension vs copying - Currently DAG builders are chained by progressively extending an existing DAG @@ -279,7 +279,7 @@ of functions - Extending DAGs node by node is in fact how they are built under the hood -## Reusing parameters across nodes' configs +### Reusing parameters across nodes' configs - The same parameter might need to be used by different objects / functions and DAG nodes and kept in sync somehow @@ -291,7 +291,7 @@ of functions - Solution #2: - A "meta_parameter" Config key with all the parameters used by multiple nodes -## Composing vs deriving objects +### Composing vs deriving objects We have a lot of composition of objects to create specialized versions of objects E.g., there is an `HistoricalDataSource` node that allows to connect an diff --git a/docs/dataflow/all.computation_as_graphs.explanation.md b/docs/dataflow/all.computation_as_graphs.explanation.md index c53387c504..ae04b245d1 100644 --- a/docs/dataflow/all.computation_as_graphs.explanation.md +++ b/docs/dataflow/all.computation_as_graphs.explanation.md @@ -1,4 +1,4 @@ - +# Computation As Graphs @@ -15,9 +15,9 @@ -# KaizenFlow computing +## KaizenFlow computing -## Introduction +### Introduction `KaizenFlow` is a computing framework to build and test AI/machine learning models that can run: @@ -44,7 +44,7 @@ Some of the advantages of the DataFlow approach are: detection of future peeking - Ability to replay and debug model executions -## DAG Node +### DAG Node A DAG Node is a unit of computation in the DataFlow model. @@ -62,7 +62,7 @@ A DAG Node has: TODO(gp): circle with inputs and outputs -### DAG node examples +#### DAG node examples Examples of operations that may be performed by nodes include: @@ -103,7 +103,7 @@ A DagConfig is hierarchical and contains one subconfig per DAG node. 
It should only include `Dag` node configuration parameters, and not information about `Dag` connectivity, which is specified in the `Dag` builder part. -## DataFrame as unit of computation +### DataFrame as unit of computation The basic unit of computation of each node is a "dataframe". Each node takes multiple dataframes through its inputs, and emits one or more dataframes as @@ -157,9 +157,9 @@ Some characteristics of dataframes are: placeholder. - They provide tools to handle, fill, or remove missing data. -# DAG execution +## DAG execution -## Simulation kernel +### Simulation kernel A computation graph is a directed graph where nodes represent operations or variables, and edges represent dependencies between these operations. @@ -170,7 +170,7 @@ operations need to be completed before others. KaizenFlow simulation kernel schedules nodes according to their dependencies. -## Implementation of simulation kernel +### Implementation of simulation kernel The most general case of simulation consists of multiple loops: @@ -222,7 +222,7 @@ component in a trading system, where a DAG computes forecasts which are acted upon based on the available funds). In this case, the simulation kernel needs to enforce dependencies in the time dimension. -## Nodes ordering for execution +### Nodes ordering for execution TODO(gp, Paul): Extend this to the multiple loop. @@ -250,7 +250,7 @@ def topological_sort(graph): return post_order[::-1] # Reverse the post-order to get the topological order ``` -## Heuristics for splitting code in nodes +### Heuristics for splitting code in nodes There are degrees of freedom in splitting the work between various nodes of a graph E.g., the same DataFlow computation can be described with several nodes or diff --git a/docs/dataflow/all.dataflow.explanation.md b/docs/dataflow/all.dataflow.explanation.md index 33ace5c8d2..148695c3a0 100644 --- a/docs/dataflow/all.dataflow.explanation.md +++ b/docs/dataflow/all.dataflow.explanation.md @@ -2,16 +2,11 @@ - * [Config](#config) - + [Config representation and properties](#config-representation-and-properties) - + [Assigning and getting Config items](#assigning-and-getting-config-items) - * [Time semantics](#time-semantics) - * [Different views of System components](#different-views-of-system-components) - * [Architecture](#architecture) - + [Component invariants](#component-invariants) - * [DataFlow computing](#dataflow-computing) - + [Template configs](#template-configs) - * [DataFlow Computation Semantics](#dataflow-computation-semantics) +- [Different views of System components](#different-views-of-system-components) +- [Architecture](#architecture) + * [Component invariants](#component-invariants) +- [DataFlow computing](#dataflow-computing) + * [Template configs](#template-configs) @@ -52,17 +47,17 @@ data became available to that component) in terms of current time. 
Each component has a way to know: -- what is the current time (e.g., the real-time machine time or the simulated +- What is the current time (e.g., the real-time machine time or the simulated one) -- the timestamp of the current data bar it's working on +- The timestamp of the current data bar it's working on Each component -- should print its state so that one can inspect how exactly it has been +- Should print its state so that one can inspect how exactly it has been initialized -- can be serialized and deserialized from disk -- can be mocked for simulating -- should save data in a directory as it executes to make the system observable +- Can be serialized and deserialized from disk +- Can be mocked for simulating +- Should save data in a directory as it executes to make the system observable Models are described in terms of DAGs using the DataFlow framework @@ -88,7 +83,6 @@ vwap_approach_2.head(3) - TODO(gp): Explain this piece of code - ### Template configs - Are incomplete configs, with some "mandatory" parameters unspecified but @@ -107,9 +101,9 @@ in sync. The client: -- calls `get_config_template()` to receive the template config -- fills / modifies the config -- uses the final config to call `get_dag(config)` and get a fully built DAG +- Calls `get_config_template()` to receive the template config +- Fills / modifies the config +- Uses the final config to call `get_dag(config)` and get a fully built DAG A `DagBuilder` can be passed to other objects instead of `Dag` when the template config is fully specified and thus the `Dag` can be constructed from it. @@ -124,5 +118,3 @@ e.g., rolling pattern - `IncrementalDagRunner`: allows to run one step at a time like in real-time - `RealTimeDagRunner`: allows to run using nodes that have a real-time semantic - - diff --git a/docs/dataflow/all.dataflow_data_format.explanation.md b/docs/dataflow/all.dataflow_data_format.explanation.md index 5e47caa3f1..ee9c74427f 100644 --- a/docs/dataflow/all.dataflow_data_format.explanation.md +++ b/docs/dataflow/all.dataflow_data_format.explanation.md @@ -1,12 +1,10 @@ - +# DataFlow Data Format -- [DataFlow Data Format](#dataflow-data-format) - -## DataFlow Data Format + As explained in [/docs/datapull/all.datapull_client_stack.explanation.md](/docs/datapull/all.datapull_client_stack.explanation.md), diff --git a/docs/dataflow/all.time_series.explanation.md b/docs/dataflow/all.time_series.explanation.md index 40d3aa457c..bd48ed1090 100644 --- a/docs/dataflow/all.time_series.explanation.md +++ b/docs/dataflow/all.time_series.explanation.md @@ -1,4 +1,4 @@ - +# Time series diff --git a/docs/dataflow/all.timing_semantic_and_clocks.md b/docs/dataflow/all.timing_semantic_and_clocks.md index 8468ab6605..7839ffdf9e 100644 --- a/docs/dataflow/all.timing_semantic_and_clocks.md +++ b/docs/dataflow/all.timing_semantic_and_clocks.md @@ -1,4 +1,4 @@ - +# Timing Semantic And Clocks @@ -19,7 +19,7 @@ -# Time semantics +## Time semantics **Time semantics**. Any DataFlow component can be executed in real-time or simulated accounting for different ways to represent the passing of time. @@ -92,9 +92,9 @@ TODO(Grisha): add an example. same "format" and with the same timing as it would be provided in real-time, but the clock type is "replayed clock". 
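As a self-contained sketch of this idea (not the actual implementation), the
real-time and the replayed wall clock can be pictured as two interchangeable
callables that a component queries instead of reading the system clock
directly:

```python
from typing import Callable, Iterator, List

import pandas as pd


def get_real_time_wall_clock() -> Callable[[], pd.Timestamp]:
    # Real-time clock: return the actual machine time in UTC.
    return lambda: pd.Timestamp.now(tz="UTC")


def get_replayed_wall_clock(
    timestamps: List[pd.Timestamp],
) -> Callable[[], pd.Timestamp]:
    # Replayed clock: return pre-recorded timestamps one at a time, so that
    # historical data can be served "as if" it were arriving in real-time.
    iterator: Iterator[pd.Timestamp] = iter(timestamps)
    return lambda: next(iterator)
```

Swapping one callable for the other is what allows the same component code to
run in real-time, replayed, or simulated mode.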
-# How clock is handled +## How clock is handled -## Asynchronous mode +### Asynchronous mode - In asynchronous mode there are multiple things happening at the same time - E.g., DAG computes, orders are sent to the market, some components wait @@ -104,12 +104,12 @@ the clock type is "replayed clock". - E.g, one CPU executes/simulates the DAG, another CPU executes/simulates the `Portfolio`, etc. -## Synchronous mode +### Synchronous mode - In synchronous mode only one thing happens at the same time - E.g., executing a piece of code using Pandas -## Async vs sync simulation +### Async vs sync simulation - We can simulate the same system in sync or async mode @@ -124,7 +124,7 @@ the clock type is "replayed clock". - Under certain constraints (e.g., when I/O overlaps with computation) a single CPU can run/simulate a truly asynchronous system -## Some cross-products of the 3 directions +### Some cross-products of the 3 directions - Not all the combinations are possible of mixing: - Historical vs replayed vs real-time @@ -173,7 +173,7 @@ A system is composed of - In fact we have a loop that does exactly that - We can run the portfolio in "debug mode" where we have a precomputed df -## Research mode +### Research mode - Run the DAG without `process_forecast` - Save ResultBundles @@ -182,29 +182,29 @@ A system is composed of - Dot product + volatility normalization + target GMV + other magic - TODO(Paul): to formally defined -## Real-time mode +### Real-time mode - Run DAG one step at the time using RealTimeDataSource and MarketDataInterface - Save ResultBundle / intermediate state - Compute rolling pnl -## Historical +### Historical - Do all the predictions and then run the SimulatedPortfolio (DataFramePortfolio) one step at the time - Maybe useful for "looping" around the Optimizer -# Flows +## Flows - Evaluating a model requires computing forecasts and then the corresponding PnL -## Forecast flow +### Forecast flow - = compute forecasts from data using the DAG - It can be historical, replayed, real-time - The outcome is a data frame with all the forecasts -## Pnl (profit and loss) flow +### Pnl (profit and loss) flow - = given forecasts and prices compute the corresponding PnL - It can be computed using: @@ -217,12 +217,12 @@ A system is composed of Some configurations are used more often than others, and so we give them a specific name -## Research flow +### Research flow - = historical flow for computing + dot product - We use it to assess the presence of alpha -## Real-time flow +### Real-time flow - All components are: - Vendor-specific implemented (e.g., TalosImClient, TalosBroker) diff --git a/docs/datapull/all.data_schema.explanation.md b/docs/datapull/all.data_schema.explanation.md index e88a73eba3..04f15aaa07 100644 --- a/docs/datapull/all.data_schema.explanation.md +++ b/docs/datapull/all.data_schema.explanation.md @@ -1,4 +1,4 @@ - +# Data Schema @@ -10,7 +10,7 @@ -# Data schema +## Data schema - The `dataset_schema` is a structured representation of metadata attributes used to describe a dataset @@ -23,7 +23,7 @@ - This structured representation facilitates easy understanding and organization of dataset metadata, enabling efficient data management and analysis. 
-## Dataset schema +### Dataset schema - The data schema signature has the following schema ``` @@ -36,7 +36,7 @@ - `periodic_daily.airflow.archived_200ms.postgres.bid_ask.spot.v7.ccxt.binance.v1_0_0` - `realtime.airflow.downloaded_200ms.postgres.bid_ask.futures.v7_4.ccxt.cryptocom.v1_0_0` -## Description of fields +### Description of fields - `download_mode`: Indicates the mode in which the dataset was downloaded. - E.g., `bulk`, `realtime` and `periodic_daily` @@ -69,7 +69,7 @@ - `version`: Denotes the version of the dataset - E.g., `v1_0_0` -# Data signature validation +## Data signature validation Perform syntactic and semantic validation of a specified dataset signature. Signature is validated by the latest dataset schema version. @@ -84,7 +84,7 @@ Signature is validated by the latest dataset schema version. `{data_type}.{asset_type}` `ohlcv.futures` is a valid signature, but `bidask.futures` is not. -# Code +## Code - The code corresponding to parsing and validating is under `//data_schema/` ``` diff --git a/docs/datapull/all.datapull_client_stack.explanation.md b/docs/datapull/all.datapull_client_stack.explanation.md new file mode 100644 index 0000000000..2c3654598e --- /dev/null +++ b/docs/datapull/all.datapull_client_stack.explanation.md @@ -0,0 +1,239 @@ + + + + +- [Data client stack](#data-client-stack) + * [Interfaces](#interfaces) + * [Transformations](#transformations) + + [Output format of `ImClient`](#output-format-of-imclient) + + [Transformations by classes derived from `MarketData`](#transformations-by-classes-derived-from-marketdata) + + [Transformations by abstract class `MarketData`](#transformations-by-abstract-class-marketdata) + + [Output format of `MarketData`](#output-format-of-marketdata) + * [Asset ids format](#asset-ids-format) + + [`ImClient` asset ids](#imclient-asset-ids) + + [`MarketData` asset ids](#marketdata-asset-ids) + + [Handling of `asset_ids`](#handling-of-asset_ids) + * [Data](#data) + + [Handling of filtering by time](#handling-of-filtering-by-time) + + [Handling timezone](#handling-timezone) + + + +# Data client stack + +As said in other documents, the data is downloaded and saved by `DataPull` with +minimal or no transformation. Once the data is downloaded, it needs to be +retrieved for processing in a common format (e.g., `DataPull` format). + +We use a two-layer approach to handle the complexity of reading and serving the +data to clients. + +```mermaid +flowchart + Vendor Data --> ImClient --> MarketData --> User +``` + +- `ImClient` + - Is data vendor and dataset specific + - Adapt data from the vendor data to a standard internal `MarketData` format + - Handle all the peculiarities in format and semantic of a specific vendor + data + - All timestamps are UTC + - Asset ids are handled as strings + +- `MarketData` + - Is independent of the data vendor + - Implement behaviors that are orthogonal to vendors, such as: + - Streaming/real-time or batch/historical + - Time-stitching of streaming/batch data, i.e., merge multiple data sources + giving a single and homogeneous view of the data + - E.g., the data from the last day comes from a real-time source while the + data before that can come from an historical source. 
The data served by + `MarketData` is a continuous snapshot of the data + - Replaying, i.e., serialize the data to disk and read it back, implementing + as-of-time semantic based on knowledge time + - This behavior is orthogonal to streaming/batch and stitching, i.e., one + can replay any `MarketData`, including an already replayed one + - Data is accessed based on intervals `[start_timestamp, end_timestamp]` using + different open/close semantics, but always preventing future peeking + - Support real-time behaviors, such as knowledge time, wall clock time, and + blocking behaviors (e.g., "is the last data available?") + - Handle desired timezone for timestamps + - Asset ids are handled as ints + +## Interfaces + +- Both `ImClient` and `MarketData` have an output format that is enforced by the + base abstract class and the derived classes together + +- `ImClient` and `MarketData` have 3 interfaces each: + + 1. An external "input" format for a class + - Format of the data as input to a class derived from `MarketData`/`ImClient` + + 2. An internal "input" format + - It's the format that derived classes need to adhere so that the base class + can do its job, i.e., apply common transformations to all classes + + 3. An external "output" format + - It's the `MarketData`/`ImClient` format, which is fixed + +## Transformations + +- The chain of transformations of the data from `Vendor` to `User` are as + follow: + + ```mermaid + flowchart + Vendor --> DerivedImClient --> AbstractImClient --> DerivedMarketData --> AbstractMarketData --> User + ``` + +- Classes derived from `ImClient` + - The transformations are vendor-specific + - Only derived classes `ImClient` know what is exact semantic of the + vendor-data + - Whatever is needed to transform the vendor data into the internal format + accepted by base `ImClient` + +- Abstract class `ImClient` + - The transformations are fixed + - Implemented by `ImClient._apply_im_normalization()` + +- Class derived from `MarketData` + - The transformations are specific to the `MarketData` derived class + +- `MarketData` + - The transformations are fixed + +### Output format of `ImClient` + +- The data in output of a class derived from `ImClient` is normalized so that: +- The index: + - Represents the knowledge time + - Is the end of the sampling interval + - Is called `timestamp` + - Is a tz-aware timestamp in UTC + +- The data: + - (optional) Is re-sampled on a 1 minute grid and filled with NaN values + - Is sorted by index and `full_symbol` + - Is guaranteed to have no duplicates + - Belongs to intervals like `[a, b]` + - Has a `full_symbol` column with a string representing the canonical name of + the instrument + +- An example of data in output from an `ImClient` is: + ``` + full_symbol close volume + timestamp + 2021-07-26 13:42:00+00:00 binance:BTC_USDT 47063.51 29.403690 + 2021-07-26 13:43:00+00:00 binance:BTC_USDT 46946.30 58.246946 + 2021-07-26 13:44:00+00:00 binance:BTC_USDT 46895.39 81.264098 + ``` + +- TODO(gp): We are planning to use an `ImClient` data format closer to + `MarketData` by using `start_time`, `end_time`, and `knowledge_time` since + these can be inferred only from the vendor data semantic + +### Transformations by classes derived from `MarketData` + +- Classes derived from `MarketData` do whatever they need to do in `_get_data()` + to get the data, but always pass back data that: + - Is indexed with a progressive index + - Has `asset`, `start_time`, `end_time`, `knowledge_time` + - `start_time`, `end_time`, `knowledge_time` are 
timezone aware + +- E.g., + ``` + asset_id start_time end_time close volume + idx + 0 17085 2021-07-26 13:41:00+00:00 2021-07-26 13:42:00+00:00 148.8600 400176 + 1 17085 2021-07-26 13:30:00+00:00 2021-07-26 13:31:00+00:00 148.5300 1407725 + 2 17085 2021-07-26 13:31:00+00:00 2021-07-26 13:32:00+00:00 148.0999 473869 + ``` + +### Transformations by abstract class `MarketData` + +- The transformations are done inside `get_data_for_interval()`, during + normalization, and are: + - Indexing by `end_time` + - Converting `end_time`, `start_time`, `knowledge_time` to the desired + timezone + - Sorting by `end_time` and `asset_id` + - Applying column remaps + +### Output format of `MarketData` + +- The abstract base class `MarketData` normalizes the data by: + - Sorting by the columns that correspond to `end_time` and `asset_id` + - Indexing by the column that corresponds to `end_time`, so that it is + suitable to DataFlow computation + +- E.g., + ``` + asset_id start_time close volume + end_time + 2021-07-20 09:31:00-04:00 17085 2021-07-20 09:30:00-04:00 143.990 1524506 + 2021-07-20 09:32:00-04:00 17085 2021-07-20 09:31:00-04:00 143.310 586654 + 2021-07-20 09:33:00-04:00 17085 2021-07-20 09:32:00-04:00 143.535 667639 + ``` + +## Asset ids format + +### `ImClient` asset ids + +- `ImClient` uses assets encoded as `full_symbols` strings + - E.g., `binance::BTC_UTC` +- There is a vendor-specific mapping: + - From `full_symbols` to corresponding data + - From `asset_ids` (ints) to `full_symbols` (strings) +- If the `asset_ids` -> `full_symbols` mapping is provided by the vendor, then + we reuse it +- Otherwise, we build a mapping hashing `full_symbols` strings into numbers + +### `MarketData` asset ids + +- `MarketData` and everything downstream uses `asset_ids` that are encoded as + ints + - This is because we want to use ints and not strings in dataframe + +### Handling of `asset_ids` + +- Different implementations of `ImClient` backing a `MarketData` are possible, + e.g.: + +- The caller needs to specify the requested `asset_ids` +- In this case the universe is provided by `MarketData` when calling the data + access methods +- The reading backend is initialized with the desired universe of assets and + then `MarketData` just uses or subsets that universe + +- For these reasons, assets are selected at 3 different points: + + 1. `MarketData` allows to specify or subset the assets through `asset_ids` + through the constructor + 2. `ImClient` backends specify the assets returned + - E.g., a concrete implementation backed by a DB can stream the data for its + entire available universe + + 3. 
Certain class methods allow querying data for a specific asset or subset of + assets + +- For each stage, a value of `None` means no filtering + +## Data + +### Handling of filtering by time + +- Clients of `MarketData` might want to query data by: +- Using different interval types, namely `[a, b), [a, b], (a, b], (a, b)` +- Filtering on either the `start_ts` or `end_ts` +- For this reason, this class supports all these different ways of providing + data +- `ImClient` has a fixed semantic of the interval `\[a, b\]` +- `MarketData` adapts the fixed semantic to multiple ones + +### Handling timezone + +- `ImClient` always uses UTC as output +- `MarketData` adapts UTC to the desired timezone, as requested by the client diff --git a/docs/datapull/all.datapull_derived_data.explanation.md b/docs/datapull/all.datapull_derived_data.explanation.md index 8b2a63dabb..1b38c6ffd9 100644 --- a/docs/datapull/all.datapull_derived_data.explanation.md +++ b/docs/datapull/all.datapull_derived_data.explanation.md @@ -1,4 +1,12 @@ -## Derived data workflows + + + + +- [Derived data workflows](#derived-data-workflows) + + + +# Derived data workflows **Derived data workflows**. Data workflows can transform datasets into other datasets diff --git a/docs/datapull/all.datapull_sandbox.explanation.md b/docs/datapull/all.datapull_sandbox.explanation.md index 0ce224db52..ce2812eacc 100644 --- a/docs/datapull/all.datapull_sandbox.explanation.md +++ b/docs/datapull/all.datapull_sandbox.explanation.md @@ -1,9 +1,18 @@ -## Sandbox + + + + +- [DataPull sandbox](#datapull-sandbox) + + + +# DataPull sandbox This paragraph describes an example of infrastructure that implements the `DataPull` protocol. It is implemented as a Docker Container containing the following services: + - Airflow - Jupyter notebook - Postgres diff --git a/docs/datapull/all.dataset_onboarding_checklist.reference.md b/docs/datapull/all.dataset_onboarding_checklist.reference.md index 08b09e0feb..8a3d73121c 100644 --- a/docs/datapull/all.dataset_onboarding_checklist.reference.md +++ b/docs/datapull/all.dataset_onboarding_checklist.reference.md @@ -7,6 +7,11 @@ * [Implement historical downloader](#implement-historical-downloader) * [Automated AKA Scheduled downloader](#automated-aka-scheduled-downloader) * [Quality Assurance](#quality-assurance) + + [1. Check for Existing QA DAGs](#1-check-for-existing-qa-dags) + + [2. Create a New QA DAG (if necessary)](#2-create-a-new-qa-dag-if-necessary) + - [2.1. Create and Test QA Notebook](#21-create-and-test-qa-notebook) + - [2.2. Run QA Notebook via Invoke Command](#22-run-qa-notebook-via-invoke-command) + + [2.3. Create a New DAG File](#23-create-a-new-dag-file) @@ -92,13 +97,34 @@ From `docs/datapull/all.dataset_onboarding_checklist.reference.md` ## Quality Assurance -- [ ] If a QA flow for a similar data type exists, evaluate if it can be - directly re-used by simply adding a new Airflow DAG/task. If it's - insufficient, file an issue to add a new QA check/flow (using a Jupyter - Notebook) -- [ ] Schedule the QA flow to Airflow by choosing one of the following options: - - Creating a new DAG - - Extending existing DAG to include a new task (preferred, if no large - modifications are needed) +### 1. Check for Existing QA DAGs + +- [ ] **Verify if there is already a similar QA DAG running.** + - [ ] Check for existing QA DAGs (e.g., bid_ask/OHLCV, Cross QA for OHLCV + comparing real-time with historical data). 
+ - [ ] Action: If the new QA is just a change in the universe or vendor, append + a new task to the existing running DAGs. Reference: + [Link to Relevant Section](https://github.com/cryptokaizen/cmamp/blob/6f6feec46704c96b9929fb174e6d66f7e94e6776/docs/datapull/ck.create_airflow_dag.tutorial.md?plain=1#L219)]. + +### 2. Create a New QA DAG (if necessary) + +#### 2.1. Create and Test QA Notebook + +- [ ] **Develop a notebook to test the QA process.** + - [ ] Test over a small period to ensure it functions as expected. + - [ ] Tip: Use a small dataset or limited time frame for quick testing. + +#### 2.2. Run QA Notebook via Invoke Command + +- [ ] **Execute the QA notebook using the invoke command to validate + functionality.** + - [ ] Example: + [Invoke Command Example](https://github.com/cryptokaizen/cmamp/blob/6f6feec46704c96b9929fb174e6d66f7e94e6776/dev_scripts/lib_tasks_data_qa.py#L266) + +### 2.3. Create a New DAG File + +- [ ] **Create a new DAG file after QA process validation.** + - [ ] Follow the standard procedure for DAG creation. Reference: + [DAG Creation Tutorial](https://github.com/cryptokaizen/cmamp/blob/6f6feec46704c96b9929fb174e6d66f7e94e6776/docs/datapull/ck.create_airflow_dag.tutorial.md). Last review: GP on 2024-04-20 diff --git a/docs/datapull/all.update_CCXT_version.how_to_guide.md b/docs/datapull/all.update_CCXT_version.how_to_guide.md index fda4039e69..d84c63eb97 100644 --- a/docs/datapull/all.update_CCXT_version.how_to_guide.md +++ b/docs/datapull/all.update_CCXT_version.how_to_guide.md @@ -1,21 +1,23 @@ +# Update Ccxt Version + -- [Testing CCXT Stability Before Docker Container Update](#testing-ccxt-stability-before-docker-container-update) - * [Steps for Performing CCXT API Tests:](#steps-for-performing-ccxt-api-tests) - * [Failure Handling:](#failure-handling) -- [Reading [CCXT Exchange Timestamp Interpretation](amp/docs/datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md)](#reading-ccxt-exchange-timestamp-interpretationampdocsdatapullckccxt_exchange_timestamp_interpretationreferencemd) - * [Steps to Confirm Timestamp Representation](#steps-to-confirm-timestamp-representation) +- [Testing CCXT stability before docker container update](#testing-ccxt-stability-before-docker-container-update) + * [Steps for performing CCXT API tests:](#steps-for-performing-ccxt-api-tests) + * [Failure handling](#failure-handling) +- [Read CCXT exchange timestamp interpretation](#read-ccxt-exchange-timestamp-interpretation) + * [Steps to confirm timestamp representation](#steps-to-confirm-timestamp-representation) -# Testing CCXT stability before docker container update +## Testing CCXT stability before docker container update -In order to ensure the stability of our code following a CCXT update, a -thorough testing process is required. Prior to constructing a new container, we -will update the CCXT version locally and execute tests on the actual API to -verify the reliability of our codebase. +In order to ensure the stability of our code following a CCXT update, a thorough +testing process is required. Prior to constructing a new container, we will +update the CCXT version locally and execute tests on the actual API to verify +the reliability of our codebase. -## Steps for performing CCXT API tests: +### Steps for performing CCXT API tests: 1. Update CCXT version locally in the container using the following command: @@ -52,17 +54,18 @@ verify the reliability of our codebase. 5. 
Verify that all test results are marked as "green" before proceeding with the update of the Docker container. -## Failure handling +### Failure handling -In the event that any test fails to pass successfully, an issue should be -filed. The issue report must include details regarding -the failure, allowing for an accurate diagnosis of the problem. +In the event that any test fails, an issue should be filed. The issue report +must include details regarding the failure, allowing for an accurate diagnosis +of the problem. -# Read CCXT exchange timestamp interpretation +## Read CCXT exchange timestamp interpretation -Read [CCXT Exchange Timestamp Interpretation](amp/docs/datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md) +Read +[CCXT Exchange Timestamp Interpretation](amp/docs/datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md) -## Steps to confirm timestamp representation +### Steps to confirm timestamp representation In order to ensure accurate and up-to-date information regarding the interpretation of timestamps in the CCXT exchange library, follow these detailed diff --git a/docs/documentation_meta/all.architecture_diagrams.explanation.md index 90e3d7a89e..e2afb752f0 100644 --- a/docs/documentation_meta/all.architecture_diagrams.explanation.md +++ b/docs/documentation_meta/all.architecture_diagrams.explanation.md @@ -1,5 +1,7 @@ # Architecture Diagrams +## Architecture Diagrams + - [Summary](#summary) @@ -37,7 +39,7 @@ -# Summary +## Summary - We use C4 as a way to describe graphically software architecture together with some conventions - Mermaid is preferred since it can be rendered natively by GitHub - PlantUML can be rendered through some of our scripts in regular markdown -# Brief introduction to C4 +## Brief introduction to C4 - A detailed description of C4 is https://C4model.com - It maps code at various levels of detail - It is useful for both software architects and developers -## Different levels of detail +### Different levels of detail - The 4 levels of detail are: 1.
(System) Context system @@ -68,7 +70,7 @@ - Show how components are implemented - Represented in terms of UML class diagrams -### (System) Context (Level 1) +#### (System) Context (Level 1) - A system context describes something that delivers value to its users - Typically a system is owned by a single software development team @@ -87,7 +89,7 @@ - A system system is made up of one or more containers -### Container (Level 2) +#### Container (Level 2) - A container represents an application - E.g., @@ -119,7 +121,7 @@ - Technical people - Inside and outside of the software development team -### Component (level 3) +#### Component (level 3) - Component is a group of related functionality encapsulated behind a well-defined interface @@ -131,7 +133,7 @@ - Audience - Software architects and developers -### Code (level 4) +#### Code (level 4) - Code is the implementation of the software system - Each component can represented in terms of UML class diagrams, entity @@ -141,13 +143,13 @@ - Audience - Software architects and developers -# Our conventions for C4 diagrams +## Our conventions for C4 diagrams -## Mapping C4 and code structure +### Mapping C4 and code structure - To simplify, we map the 4 levels of C4 in the code structure -### (System) Context (Level 1) +#### (System) Context (Level 1) - = big picture of how the system interacts with users and other systems - Mapped onto a code repository @@ -155,7 +157,7 @@ - `//...` is a system providing data and analytics for commodity - `//pre-commit` is a system implementing a code linter -### Container (Level 2) +#### Container (Level 2) - = high-level software architecture and how responsibilities are split in the system @@ -166,7 +168,7 @@ - `etl3`: back-end db for time series with real-time and point-in-time semantics -### Component (Level 3) +#### Component (Level 3) - = a group of related functionality encapsulated behind a well-defined interface (e.g., collection of classes behind an interface) @@ -177,7 +179,7 @@ commodities and companies - `form8`: data pipeline processing form 8 -### Component (Level 4) +#### Component (Level 4) - = OOP classes - Typically we organize multiple related classes in files @@ -189,7 +191,7 @@ - `match_targets.py` - `normalize_table.py` -## Use classes! +### Use classes! 
- In order to be able to describe the system with C4 it is best to use classes to separate responsibilities and package code @@ -212,29 +214,29 @@ o(some argument).f(other arguments) ``` -## Generating class diagram +### Generating class diagram - To generate a class diagram (level 4 of C4), you can run ```bash > dev_scripts/create_class_diagram.sh ``` -# Mermaid +## Mermaid -## Class diagram +### Class diagram - See https://mermaid.js.org/syntax/classDiagram.html -## Support for C4 +### Support for C4 - Mermaid supports most features of C4 - See https://mermaid.js.org/syntax/c4.html -## Render on-line +### Render on-line - See https://mermaid.live/edit -# PlantUML +## PlantUML - Unified Modeling Language (UML) is a modeling language for software engineering to provide a standard way to visualize design of a system @@ -250,7 +252,7 @@ - The website https://structurizr.com has lots of information on using tools for C4 and lots of [examples](https://structurizr.com/share/52804/plantuml) -## PlantUML is Markdown +### PlantUML is Markdown - We use PlantUML for rendering diagrams in our documentation - For interactive use you can rely on online tools like: @@ -274,7 +276,7 @@ ``` ```` -### `render_md.py` tool +#### `render_md.py` tool - We have a `render_md.py` tool to embed images after `plantuml` section. Typical usage to insert images to the markdown file and to preview it: @@ -282,7 +284,7 @@ > render_md.py -i knowledge_graph/vendors/README.md ``` -#### How to use +##### How to use 1. Make sure `plantuml` is installed on your machine. The easiest way is to use the Docker container. All the packages typically needed for development are @@ -302,7 +304,7 @@ 3. If you want to use `open` action, make sure that your machine is able to open `.html` files in the browser. -### Our conventions +#### Our conventions - Names - Each name in mappings should be exactly the same (maybe without some invalid @@ -408,11 +410,11 @@ You can find the correspondent `architecture.md` file [here](https://github.com/.../.../blob/master/edgar/forms8/architecture.md). 
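To make the "Use classes!" guidance above concrete, here is a minimal, hypothetical Python sketch (the class names and data are invented for illustration and are not part of the codebase): each class encapsulates a single responsibility behind a small interface, which is the granularity a Level 4 class diagram (e.g., one generated with `dev_scripts/create_class_diagram.sh` or drawn in Mermaid) is meant to capture, and the composition at the bottom follows the `object(arguments).method(arguments)` calling style mentioned above.

```python
import pandas as pd


class PriceLoader:
    """Load close prices for one asset (single responsibility: data access)."""

    def load(self, n_bars: int) -> pd.Series:
        # Placeholder data so that the sketch is self-contained.
        return pd.Series([100.0, 101.0, 99.5, 100.5][:n_bars], name="close")


class ReturnsComputer:
    """Turn prices into returns (single responsibility: transformation)."""

    def compute(self, prices: pd.Series) -> pd.Series:
        return prices.pct_change().rename("ret")


# Compose the two components, as a class diagram would depict them.
rets = ReturnsComputer().compute(PriceLoader().load(4))
print(rets)
```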
-## Plotting online +### Plotting online - [Plantuml on-line](https://plantuml-editor.kkeisuke.com/) -# UML - Unified Modeling Language +## UML - Unified Modeling Language - The Unified Modeling Language (UML) serves as a versatile visual modeling language designed to offer a standard way to visualize the design of a system @@ -432,7 +434,7 @@ -## UML Class Diagrams +### UML Class Diagrams - The UML Class Diagram is a graphical notation used to construct and visualize object-oriented systems @@ -444,7 +446,7 @@ - Methods - Relationships among objects -### Class Notation +#### Class Notation - A class represents a concept which encapsulates state (attributes) and behavior (methods) @@ -488,12 +490,12 @@ } ``` -### Class Relationships +#### Class Relationships - Classes can engage in multiple relationships with other classes - Relationships in UML class diagrams can be defined in several distinct types -#### Inheritance/Generalization +##### Inheritance/Generalization - It indicates that: - One of the two related classes (the subclass) is considered to be a @@ -523,7 +525,7 @@ } ``` -#### Association +##### Association - Associations are relationships between classes in a UML Class Diagram - They are represented by a solid line between classes @@ -618,7 +620,7 @@ Corporation ..|> Owner ``` -### Complete Example +#### Complete Example ```mermaid diff --git a/docs/documentation_meta/all.diataxis.explanation.md b/docs/documentation_meta/all.diataxis.explanation.md index 1401e9f00b..04844edd11 100644 --- a/docs/documentation_meta/all.diataxis.explanation.md +++ b/docs/documentation_meta/all.diataxis.explanation.md @@ -1,4 +1,4 @@ - +# Diataxis @@ -10,7 +10,7 @@ -# Diataxis: a framework to write documentation +## Diataxis: a framework to write documentation For more information look [https://diataxis.fr/](https://diataxis.fr/) @@ -23,7 +23,7 @@ There are 4 modes of documentation -# Tutorial +## Tutorial - Learning oriented - Is a playground for users to learn something about the product by completing a @@ -37,7 +37,7 @@ There are 4 modes of documentation - Example: [datapull/ck.create_airflow_dag.tutorial.md](https://github.com/cryptokaizen/cmamp/blob/master/docs/datapull/ck.create_airflow_dag.tutorial.md) -# How-to guide +## How-to guide - Goal oriented - Is a guide to complete a real-world task @@ -47,7 +47,7 @@ There are 4 modes of documentation - Example: [work_tools/all.pycharm.how_to_guide.md](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_tools/all.pycharm.how_to_guide.md) -# Reference +## Reference - Information oriented - Provide a technical description of a component/piece of infra. 
The emphasis is @@ -60,7 +60,7 @@ There are 4 modes of documentation - Example: [datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md](https://github.com/cryptokaizen/cmamp/blob/master/docs/datapull/ck.ccxt_exchange_timestamp_interpretation.reference.md) -# Explanation +## Explanation - Understanding oriented - Is used in our documentation to explain design decisions and choices, diff --git a/docs/documentation_meta/all.gdocs.how_to_guide.md b/docs/documentation_meta/all.gdocs.how_to_guide.md index 9b075aeb48..4fa600c101 100644 --- a/docs/documentation_meta/all.gdocs.how_to_guide.md +++ b/docs/documentation_meta/all.gdocs.how_to_guide.md @@ -1,135 +1,13 @@ -# Markdown vs Google Docs -## In general -- We prefer to use Markdown for technical documentation -- We use Google for notes from meetings and research + -## Markdown pros + * [Other approaches](#other-approaches) +- [Markdown -> Gdocs](#markdown---gdocs) -- Can use vim -- Can version control -- Easy to use verbatim (e.g., typing `foobar`) -- Easy to style using pandoc -- Easy to embed code -- Easy to add Latex equations -- Easy to grep + -## Google Docs pros - -- Easy to embed figures -- Easy to collaborate -- Easy to make quick changes (instead of making a commit) -- Easy to publish (just make them public with proper permissions) -- Styling - - [https://webapps.stackexchange.com/questions/112275/define-special-inline-styles-in-google-docs](https://webapps.stackexchange.com/questions/112275/define-special-inline-styles-in-google-docs) -- Interesting add-ons: - - Enable Markdown - - Code blocks - - Use darcula, size 10 - ```python - def hello(): - print("hello") - ``` - - Auto-latex equations - -## Rules of thumb - -- Use Markdown - - If doc is going to be used as a public guideline - - If doc has mostly text, code, and formulas - - If there are notes from a book -- Use Gdoc - - If doc requires a lot of images that cannot be placed as text - - If doc is a research of an analysis - -# Google docs style conventions - -## Headings - -- We add N (where N is the heading level) `#` before the heading name, e.g., - - Heading 1: - ```markdown - # Heading 1 - ``` - - Heading 2: - ```markdown - ## Heading 2 - ``` -- The reason is that sometimes one doesn't have the time or the patience to - format things properly, so at least there is some indication of the level of - the titles -- Avoid having multiple `#` separatd by a space that sometimes appear in a - process of convertion of Gdocs to Markdown files - - _Bad_ - ```markdown - # # Heading 1 - ``` - - _Good_ - ```markdown - # Heading 1 - ``` - -## Font - -- Normal text: - - Font: Arial - - Font size: 11 -- Headings: - - Font: Arial - - Style: bold - - Font size: should be adjusted automatically when one converts “Normal text” - to “Heading N”, e.g., when converting some text of size 11 to “Heading 1” - the font sizes becomes 20 - - -# Convert between Gdocs and Markdown - -## Gdocs -> Markdown - -### Using `convert_docx_to_markdown.py` - -- This python script converts Docx to Markdown using Pandoc. 
-- In general, we recommend using this approach - -- Pros - - Removing artifacts with this python script, less manual work - - Best for a large document - - Handle figures -- Cons - - Need to move files - -### Process - -- Download Google document as `docx` -- Move the file in place - ```bash - > FILE_NAME=docs/dataflow/all.best_practice_for_building_dags.explanation - > mv /Users/saggese/Downloads/Blank.docx $FILE_NAME.docx - > convert_docx_to_markdown.py --docx_file $FILE_NAME.docx --md_file $FILE_NAME.md - ``` -- Convert it to markdown using `convert_docx_to_markdown.py` -- Usage: - ```bash - > convert_docx_to_markdown.py --docx_file Tools_Docker.docx --md_file Tools_Docker.md - ``` - - This command should be run directly under the target output directory for - the Markdown file, in order to generate correct image links. Otherwise, - you'll need to manually fix the image links. - - File names can't contain any spaces. Therefore, use underscores `_` to - replace any spaces. - -### Cleaning up converted markdown - -- Fix some formatting manually before running the Markdown linter. - - Read through [Style and cosmetic lints](#style-and-cosmetic-lints) for - Markdown formatting and fix some formatting based on the rules. - - Summary - - Add the following tag at the top of the markdown file below the document - title: - ```markdown - - ``` +``` - Use bullet lists to organize the whole Markdown for consistency with other docs. See [all.coding_style.how_to_guide.md](https://github.com/cryptokaizen/cmamp/blob/master/docs/coding/all.coding_style.how_to_guide.md) @@ -175,7 +53,7 @@ - Move the gdoc to the [\_OLD directory](https://drive.google.com/drive/u/0/folders/1J4B1vq8EwT-q_z7qSLCZ9Tug2CA9f8i7) -### Other approaches +#### Other approaches - Best for a large document - Approach 1 - Chrome Docs to Markdown extension: @@ -195,7 +73,7 @@ [Cleaning up converted markdown](#cleaning-up-converted-markdown) - You might need to remove artifacts manually -## Markdown -> Gdocs +### Markdown -> Gdocs - Approach 1: - Run diff --git a/docs/documentation_meta/all.google_technical_writing.how_to_guide.md b/docs/documentation_meta/all.google_technical_writing.how_to_guide.md index 4715ad7498..036098a270 100644 --- a/docs/documentation_meta/all.google_technical_writing.how_to_guide.md +++ b/docs/documentation_meta/all.google_technical_writing.how_to_guide.md @@ -1,16 +1,39 @@ +# Google Technical Writing + + +- [Google's technical writing: Part 1](#googles-technical-writing-part-1) + * [Define new or unfamiliar](#define-new-or-unfamiliar) + * [Use terms consistently](#use-terms-consistently) + * [Use acronyms properly](#use-acronyms-properly) + * [Use strong verbs](#use-strong-verbs) + * [Use short sentences](#use-short-sentences) + * [Remove fillers](#remove-fillers) + * [Focus each paragraph on a single topic](#focus-each-paragraph-on-a-single-topic) + * [Avoid wall-of-text](#avoid-wall-of-text) + * [Answer what, why, and how](#answer-what-why-and-how) + * [Know your audience](#know-your-audience) + * [State document's scope](#state-documents-scope) + * [Summarize the key points at the start](#summarize-the-key-points-at-the-start) +- [Google's technical writing: Part 2](#googles-technical-writing-part-2) + * [Adopt a style guide](#adopt-a-style-guide) + * [Think like your audience](#think-like-your-audience) + * [Come back to it later](#come-back-to-it-later) + * [Organizing large docs](#organizing-large-docs) +- [Resources](#resources) + // From https://developers.google.com/tech-writing/one/ -# Google's 
technical writing: Part 1 +## Google's technical writing: Part 1 -## Define new or unfamiliar +### Define new or unfamiliar - If your document introduces a term, define the term - If the term already exists, link to a good existing explanation -## Use terms consistently +### Use terms consistently - Don't change the name of something while talking about it - E.g., `Protocol Buffers` vs `protobufs` @@ -19,7 +42,7 @@ Protocol Buffers (or protobufs for short) ``` -## Use acronyms properly +### Use acronyms properly - On the initial use of an unfamiliar acronym spell out the full term - E.g., `Telekinetic Tactile Network (TTN) ...` @@ -28,7 +51,7 @@ - An acronym should be significantly shorter than the full term - Don't define acronyms that will be used only a few times -## Use strong verbs +### Use strong verbs - Choose precise, strong, and specific verbs - Weak verbs are "be", "occur", "happen" @@ -43,14 +66,14 @@ Dividing by zero raises the exception. The exception occurs when dividing by zero. ``` -## Use short sentences +### Use short sentences - Each sentence should convey a single idea, thought, concept - Break long sentences into single-idea sentences - Convert long sentences into bulleted list - E.g., "and", "or" suggest to refactor into a bulleted list -## Remove fillers +### Remove fillers **Good** ``` @@ -62,17 +85,17 @@ This design document describes Project Frambus. This design document provides a detailed description of Project Frambus. ``` -## Focus each paragraph on a single topic +### Focus each paragraph on a single topic - A paragraph is an independent unit of logic - Ruthlessly delete sentence that doesn't relate to the current topic -## Avoid wall-of-text +### Avoid wall-of-text - Readers often ignore long paragraphs - Paragraphs should contain 3 to 5 sentences -## Answer what, why, and how +### Answer what, why, and how - Good paragraphs answer the following questions - What: what are you trying to tell your reader? @@ -90,7 +113,7 @@ This design document provides a detailed description of Project Frambus. `garp()` value suggests that the mean is more meaningful than when the `garp()` value is relatively high. -## Know your audience +### Know your audience - Your document needs to provide information that your audience needs but doesn't already have @@ -110,7 +133,7 @@ This design document provides a detailed description of Project Frambus. - Avoid the "curse of knowledge": experts forget that novices don't know what you already know -## State document's scope +### State document's scope - A good document begins by defining its scope and its non-scope, e.g., ``` @@ -118,7 +141,7 @@ This design document provides a detailed description of Project Frambus. technology Froobus. ``` -## Summarize the key points at the start +### Summarize the key points at the start - Ensure that the start of your document answers your readers' essential questions @@ -126,28 +149,32 @@ This design document provides a detailed description of Project Frambus. 
// From https://developers.google.com/tech-writing/two -# Google's technical writing: Part 2 +## Google's technical writing: Part 2 + +### Adopt a style guide -## Adopt a style guide - Many companies and large open source projects adopt a style guide for documentation - E.g., https://developers.google.com/style -## Think like your audience +### Think like your audience + - Step back and try to read your draft from the point of view of your audience -## Come back to it later +### Come back to it later + - After you write your first (or second or third) draft, set it aside - Come back later and read it with fresh eyes to find things you can improve -## Organizing large docs +### Organizing large docs + - You can organize a collection of information into - - a longer standalone document; or - - set of shorter interconnected documents (e.g., website, wiki) + - A longer standalone document; or + - Set of shorter interconnected documents (e.g., website, wiki) - Pros: easy to find information searching in the single back // TODO -# Resources +## Resources - [https://developers.google.com/tech-writing/overview] diff --git a/docs/documentation_meta/all.writing_docs.how_to_guide.md b/docs/documentation_meta/all.writing_docs.how_to_guide.md index efc6a38ae6..b9b7a5283a 100644 --- a/docs/documentation_meta/all.writing_docs.how_to_guide.md +++ b/docs/documentation_meta/all.writing_docs.how_to_guide.md @@ -1,20 +1,22 @@ +# Writing Docs + -# Conventions +## Conventions -## Make no assumptions on the user's knowledge +### Make no assumptions on the user's knowledge - Nothing is obvious to somebody who doesn't know -## Verify that things worked +### Verify that things worked - Add ways to verify if a described process worked - E.g., "do this and that, if this and that is correct should see this" - Have a trouble-shooting procedure - One approach is to always start from scratch -## Always use the linter +### Always use the linter - Most cosmetic lints described further can be taken care automatically by our markdown linter, so make sure to run it after implementing the changes @@ -24,7 +26,7 @@ - If the linter messes up the text, file an issue with examples of what the linter does incorrectly -## Add a table of content +### Add a table of content - Unfortunately both markdown and GitHub don't support automatically generating a TOC for a document @@ -35,13 +37,19 @@ ``` - Run `i lint` to build the TOC automatically +## + +- Make sure the headings structure contains exactly one level 1 heading + (`# This one`) + - This is important for displaying MkDocs documentation correctly via browser + ## Use 80 columns formatting for md files - Our markdown linter takes care of reflowing the text - Vim has a `:gq` command to reflow the comments - There are plugins for PyCharm and VisualStudio -## Use good vs bad +### Use good vs bad - Make examples of "good" ways of doing something and contrast them with "bad" ways @@ -58,7 +66,7 @@ ... 
``` -## Use an empty line after heading +### Use an empty line after heading - Leave an empty line after a heading to make it more visible, e.g., @@ -78,7 +86,7 @@ - Our linter automatically takes care of this -## Bullet lists +### Bullet lists - We like using bullet list since they represent the thought process, force people to focus on short sentences (instead of rambling wall-of-text), and @@ -94,7 +102,7 @@ - We use `-` instead of `*` or circles - The linter automatically enforces this -## Use the right syntax highlighting +### Use the right syntax highlighting - When using a block of code use the write syntax highlighting - Code (```python) @@ -119,18 +127,18 @@ .... ``` -## Indent `code` style +### Indent `code` style - GitHub / Pandoc seems to render incorrectly a code block unless it's indented over the previous line -## Embed screenshots only when strictly necessary +### Embed screenshots only when strictly necessary - Avoid to use screenshots whenever possible and use copy-paste of text with the right highlighting - However, sometimes we need to use screenshots (e.g., plots, website interface) -## Improve your written English +### Improve your written English - Use English spell-checker, but unfortunately this is not enough - Type somewhere where you can use several choices: @@ -141,7 +149,7 @@ correction - Otherwise you will keep making the same mistakes forever -## Make sure your markdown looks good +### Make sure your markdown looks good - Compare your markdown with other already published @@ -149,18 +157,18 @@ - Check in the code a branch and use GitHub to render it - Use Pycharm to edit, which also renders it side-by-side -## Do not overcapitalize headings +### Do not overcapitalize headings - Paragraph titles should be like `Data schema` not `Data Schema` -## Update the `Last review` tag +### Update the `Last review` tag - When you read/refresh a file update the last line of the text ```verbatim Last review: GP on 2024-04-20, Paul on 2024-03-10 ``` -## Comment the code structure +### Comment the code structure - When you want to describe and comment the code structure do something like this @@ -182,7 +190,7 @@ Script to test a schema ``` -## Convention for file names +### Convention for file names - Each file name should have a format like `docs/{component}/{audience}.{topic}.{diataxis_tag}.md` @@ -198,7 +206,7 @@ // From https://opensource.com/article/20/3/documentation -## Use active voice +### Use active voice - Use the active voice most of th time and use the passive voice sparingly - Active voice is shorter than passive voice @@ -212,39 +220,39 @@ - There configurations can be changed by ... 
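The file-name convention described above (`docs/{component}/{audience}.{topic}.{diataxis_tag}.md`) can also be checked mechanically. The sketch below is only illustrative and is not an existing lint; the set of allowed Diataxis tags (`tutorial`, `how_to_guide`, `reference`, `explanation`) is taken from the examples used throughout these docs, and the regex details (e.g., allowed characters) are assumptions.

```python
import re

# Hypothetical checker for docs/{component}/{audience}.{topic}.{diataxis_tag}.md.
DOC_NAME_RE = re.compile(
    r"^docs/(?P<component>[a-z0-9_]+)/"
    r"(?P<audience>[a-z0-9_]+)\.(?P<topic>[A-Za-z0-9_]+)\."
    r"(?P<tag>tutorial|how_to_guide|reference|explanation)\.md$"
)


def check_doc_name(path: str) -> bool:
    """Return True if `path` follows the documentation naming convention."""
    return DOC_NAME_RE.match(path) is not None


assert check_doc_name("docs/datapull/all.update_CCXT_version.how_to_guide.md")
assert check_doc_name("docs/infra/all.aws_scripts.reference.md")
assert not check_doc_name("docs/datapull/update_CCXT_version.md")
```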
-## Use simple short sentences +### Use simple short sentences - Use Grammarly/ChatGPT -## Format for easy reading +### Format for easy reading - Use headings, bullet points, and links to break up information into chunks instead of long explanatory paragraphs -## Keep it visual +### Keep it visual - Use tables and diagrams, together with text, whenever possible -## Mind your spelling +### Mind your spelling - Always, always, always spell check for typos and grammar check - Use Grammarly/ChatGPT -## Be efficient +### Be efficient - Nobody wants to read meandering paragraphs in documentation - Engineers want to get technical information as efficiently as possible - Do not add "fluff" - Do not explain things in a repetitive way -## Do not add fluff +### Do not add fluff - Always point to documentation on the web instead of summarizing it - If you want to summarize some doc (e.g., so that people don't have to read too much) add it to a different document instead of mixing with our documentation - Focus on how we do, why we do, rather than writing AI-generated essays -# Resources +## Resources - [https://opensource.com/article/20/3/documentation] - [Markdown cheatsheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) diff --git a/docs/general_background/all.common_abbreviations.reference.md b/docs/general_background/all.common_abbreviations.reference.md index 651d8566df..3d4878af8f 100644 --- a/docs/general_background/all.common_abbreviations.reference.md +++ b/docs/general_background/all.common_abbreviations.reference.md @@ -1,4 +1,15 @@ -- -> room, -> = "let’s go to the conference room" (typically the one pinned in the chat) + + + + +- [Common abbreviations](#common-abbreviations) + + + +# Common abbreviations + +- -> room, -> = "let’s go to the conference room" (typically the one pinned in + the chat) - AFAIK = as far as I know - AFK, AFTK = away from (the) keyboard - BM = Build-meister @@ -7,7 +18,7 @@ - GH = GitHub - IMO = in my opinion - KG = knowledge graph -- KG-OG = KG Original Gangsta +- KG-OG = KG Original Gangsta - KG-fication = The process of turning something in a KG - KOTH = King Of The Hill (the best model so far) - OOO, OOTO = Out Of The Office @@ -23,6 +34,6 @@ - WIP = work in progress - Windows = the worst OS ever made - ZH = ZenHub -- np = no problem -- sg = sounds good -- vim = the best editor ever made +- Np = no problem +- Sg = sounds good +- Vim = the best editor ever made diff --git a/docs/general_background/all.glossary.reference.md b/docs/general_background/all.glossary.reference.md index 6cd69b0435..57c5075936 100644 --- a/docs/general_background/all.glossary.reference.md +++ b/docs/general_background/all.glossary.reference.md @@ -27,25 +27,25 @@ - HLD (High Level Design) - Is a general system design and includes the description of the System - architecture and design + architecture and design - IM (Instrument Master) - A software component that associates symbolic names to assets and their - prices + prices - Integrator - Someone on the team that is in charge of merging code to the main line of - development + development - Aka: master - OHLCV bar - An **open-high-low-close chart** (also **OHLC**) is a type of chart - typically used to illustrate movements in the price of a financial instrument - over time + typically used to illustrate movements in the price of a financial + instrument over time - OMS (Order Management System) - A software component in charge of placing and monitoring trading orders to - market or broker + market or broker - PR 
(Pull Request) - Request to merge code in GitHub diff --git a/docs/general_background/all.literature_review.reference.md b/docs/general_background/all.literature_review.reference.md index b598b9c2bd..8a765e5ce7 100644 --- a/docs/general_background/all.literature_review.reference.md +++ b/docs/general_background/all.literature_review.reference.md @@ -1,4 +1,4 @@ - +# Literature Review @@ -19,7 +19,7 @@ -## Meta +### Meta - Year - Title - Paper authors: @@ -46,10 +46,10 @@ - Read the bibliography - Try experiments -## To cut and paste +### To cut and paste ``` -## Year - Title +### Year - Title - Paper authors: - [Link]() - Review author / date: @@ -60,9 +60,9 @@ - Next steps: ``` -# News for commodity prediction +## News for commodity prediction -## 2015 - The role of news in commodity markets +### 2015 - The role of news in commodity markets - Paper authors: Borovkova - [Link](https://drive.google.com/file/d/1p3Z6W5DPBrDyTGBK__uLE2gNkQDO6VTM/view?usp=sharing) @@ -94,9 +94,9 @@ - How to deliver "event study" models to customers? Should we "unroll the model" for them providing a stream of predictions? -# Social sentiment +## Social sentiment -## 2015, Predicting global economic activity with media analytics +### 2015, Predicting global economic activity with media analytics - Paper authors: Peterson et al. - Link: In `Tech/papers` @@ -114,7 +114,7 @@ - Consider the difference in professional news vs social news sentiment - What does it mean if there are large statistically significant difference? -## 2018 - Twitter, Investor Sentiment and Capital Markets, what do we know? +### 2018 - Twitter, Investor Sentiment and Capital Markets, what do we know? - Paper authors: - Review author: GP, 2019-08-21 @@ -130,9 +130,9 @@ - Read all the bibliography and reproduce some of the results - TODO: Update this to new template -# Time series +## Time series -## On-Line Learning of Linear Dynamical Systems: Exponential Forgetting in Kalman Filters +### On-Line Learning of Linear Dynamical Systems: Exponential Forgetting in Kalman Filters - Paper authors: Mark Kozdoba, Jakub Marecek, Tigran Tchrakian, and Shie Mannor - Review author: Paul, 2019-12-02 @@ -177,7 +177,7 @@ - The proofs of the results of the paper would no longer apply - It isn't obvious how the learning rate ought to be chosen -## Predictive State Smoothing (PRESS): Scalable non-parametric regression for high-dimensional data with variable selection +### Predictive State Smoothing (PRESS): Scalable non-parametric regression for high-dimensional data with variable selection - Paper author: Georg M. 
Goerg - Review author: Paul, 2019-12-03 @@ -210,7 +210,7 @@ - If no implementation is available, scope out how much work a minimal pandas-compatible implementation would require -## 2019, High-Dimensional Multivariate Forecasting with Low-Rank Gaussian Copula Processes +### 2019, High-Dimensional Multivariate Forecasting with Low-Rank Gaussian Copula Processes - Paper authors: David Salinas, Michael Bohlke-Schneider, Laurent Callot, Roberto Medico, Jan Gasthaus @@ -234,7 +234,7 @@ - Use in cases where we have a large number of time series known to have meaningful correlations -## 2014, The topology of macro financial flow using stochastic flow diagrams +### 2014, The topology of macro financial flow using stochastic flow diagrams - Paper authors: Calkin, De Prado - [Link](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2379319) @@ -256,9 +256,9 @@ - Next steps: - None -# Computer engineering +## Computer engineering -## 2015, Hidden technical debt in machine learning systems +### 2015, Hidden technical debt in machine learning systems - Paper authors: Sculler et al. - [Link](http://papers.nips.cc/paper/5656-hidden-technical-debt-in-machine-learning-systems.pdf) diff --git a/docs/general_background/all.reading_list.reference.md b/docs/general_background/all.reading_list.reference.md index d40878ebd4..5afaf74baf 100644 --- a/docs/general_background/all.reading_list.reference.md +++ b/docs/general_background/all.reading_list.reference.md @@ -1,5 +1,7 @@ # Reading List +## Reading List + - [Git](#git) @@ -11,21 +13,20 @@ -# Git +## Git - [Short tutorial](https://git-scm.com/docs/gittutorial) - [Pro Git book](https://git-scm.com/book/en/v2) - To achieve mastery -# Bash / Linux +## Bash / Linux - [Short tutorial](https://www.usenix.org/sites/default/files/conference/protected-files/lisa19_maheshwari.pdf) - [Missing semester of CS](https://missing.csail.mit.edu/) -# Coding +## Coding - [The Pragmatic Programmer](https://www.amazon.com/Pragmatic-Programmer-Journeyman-Master/dp/020161622X) - - Aka the Black Book - Reading and (really) understanding this is equivalent to accumulate 20 years of coding @@ -38,14 +39,14 @@ - And, yes you are correct noticing that Joel is holding the table tennis racquet incorrectly in the picture -# Data analysis +## Data analysis - [Python for Data Analysis](https://www.amazon.com/Python-Data-Analysis-Wrangling-IPython/dp/1491957662) - Reading is not enough: you should have tried _all_ the examples of the book - Remember: whatever you want to do, there is a more effective pandas way to do it in one line -# SRE +## SRE - [Site Reliability Engineering](https://landing.google.com/sre/sre-book/toc/index.html) - "Members of the SRE team explain how their engagement with the entire @@ -53,6 +54,6 @@ maintain some of the largest software systems in the world." 
- An outstanding reference drawing on a wealth of experience -# Arbitrage +## Arbitrage - [Trading and Arbitrage in Cryptocurrency Markets](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3171204) diff --git a/docs/infra/all.aws_iam_configuration.explanation.md b/docs/infra/all.aws_iam_configuration.explanation.md new file mode 100644 index 0000000000..d05ce20451 --- /dev/null +++ b/docs/infra/all.aws_iam_configuration.explanation.md @@ -0,0 +1,169 @@ +# AWS IAM Configuration Best Practices Guide + + + +- [Introduction](#introduction) +- [Principle of Least Privilege (PoLP)](#principle-of-least-privilege-polp) +- [Structuring IAM Policies](#structuring-iam-policies) +- [Segmentation of Permissions](#segmentation-of-permissions) +- [Naming Conventions](#naming-conventions) + * [IAM Roles](#iam-roles) + * [IAM Groups](#iam-groups) + * [IAM Group Policies](#iam-group-policies) + * [IAM Policies](#iam-policies) + * [Example Naming Conventions](#example-naming-conventions) + - [Explanation](#explanation) + + + +## Introduction + +This document outlines the best practices for structuring, segmenting, and +naming AWS Identity and Access Management (IAM) roles, groups, and policies. +Adhering to these practices will ensure that IAM configurations remain scalable, +auditable, and aligned with the principle of least privilege (PoLP), supporting +both current needs and future expansions. + +## Principle of Least Privilege (PoLP) + +- **Definition**: Ensure that IAM entities (users, roles, groups) have only the + permissions necessary to perform their assigned tasks, no more, no less. Each + IAM policy should grant only the permissions necessary for the user or service + to perform its intended tasks. +- **Application**: Review existing permissions regularly and adjust to fit + changing requirements while adhering to this minimal access principle. Employ + conditions in policies to restrict access further, such as limiting actions to + specific IP ranges, or restricting access between environments by leveraging + tags. + +Ensure that IAM policies grant only the permissions necessary for users to +perform their job functions. + +- **Restrictive Resource Access**: Instead of granting broad permissions like + `"Resource": "*"`, specify resources more explicitly wherever possible. For + instance, avoid definitions such as `secretsmanager:*`, and `kms:*`. Consider + specifying only required actions unless absolutely necessary, and always + restrict the resources to specific ARNs when possible. +- **Action-Specific Policies**: Limit actions to those absolutely necessary. For + example, if a user or service only needs to read from an S3 bucket, they + should not have write access. +- **Condition Statements**: Use condition statements to enforce policy + application under specific circumstances, adding an additional layer of + security. + +## Structuring IAM Policies + +- **Atomic Policies**: Create policies that are specific to a single purpose or + service. +- **Minimize Wildcards**: Use specific resource ARNs instead of broad wildcards + where practical to limit access scope. +- **Use Conditions**: Apply conditions to control when and how permissions are + granted. +- **Organize Statements Logically**: Group related permissions into the same + statement where it makes sense for clarity and manageability. DRY! +- **Separate Critical and Non-critical Access**: Clearly differentiate policies + handling critical resources (like production databases) from non-critical + resources. 
For instance, avoid using `s3:*` permissions on non-critical + buckets; specify allowed actions. + +## Segmentation of Permissions + +Segmentation involves dividing IAM policies based on the type of access or +function they serve. This makes policies easier to manage and understand. + +- **Functional Segmentation**: Group permissions by AWS service (e.g., ECR, S3, + ECS) and by the nature of access (read-only, read-write). This would make it + easier to manage and audit permissions. +- **Resource-Specific Policies**: Instead of using wildcards, specify which + resources a group or user can access. This minimizes the risk of unintentional + access. +- **Environment Segmentation**: Differentiate between production and + non-production environments within policies to prevent accidental access to + critical resources. +- **Role-Based Access Control (RBAC)**: Assign users to groups based on their + job function and assign policies to these groups. +- **Temporary Credentials**: Use roles and temporary credentials for short-term + access, minimizing long-term security risks. + +## Naming Conventions + +Clear naming conventions help in quickly identifying the purpose of a policy, +which resources it relates to, and the permissions level it grants. + +1. **Environment Prefix**: Use prefixes such as `Dev`, `Preprod`, or `Prod` to + indicate the environment. +2. **Service or Functional Descriptors**: Include the AWS service or the + function (e.g., `ECR`, `S3`) in the policy name. +3. **Access Level**: Specify the access level (e.g., `ReadOnly`, `ReadWrite`) in + the policy name. +4. **Resource Type or Identifier**: Where applicable, include a resource + identifier to specify the scope, to provide additional context about the type + or specific identifiers of resources involved (e.g., `DataBuckets`). + +### IAM Roles + +IAM roles should clearly reflect the service and purpose they are designed to +support. + +- **Format**: `[Environment][Service][Purpose]Role` +- **Example for EC2**: `ProdEC2InstanceManagementRole` +- **Example for EKS**: `PreprodEKSClusterAdminRole` + +This format identifies the environment (Prod, Preprod), the AWS service (EC2, +EKS), the role's purpose (InstanceManagement, ClusterAdmin), and it ends with +the word "Role" to distinguish it as an IAM role. + +### IAM Groups + +Groups often represent a collection of users with similar permissions. Names +should reflect the organizational units (OUs) or user role they are intended +for: + +- **Format**: `[userRole]-[permissionLevel]-group` +- **Example for Developer**: `developer-limited-group` +- **Example for DevOps**: `devops-extended-group` + +This convention highlights the role (e.g., Developer, DevOps) and the level of +privilege (e.g., 'Limited' for basic access, 'Extended' for broader permissions, +'Custom' for specially crafted permissions), which are crucial for understanding +what the users in the group can do. + +### IAM Group Policies + +Group policies should be named similarly to individual IAM policies but should +indicate they are associated with a group. + +- **Format**: + `[Environment][Service][AccessLevel][ResourceIdentifier]GroupPolicy` +- **Example for S3 access**: `PreprodS3ReadOnlyDataBackupGroupPolicy` +- **Example for ECS access**: `DevECSReadOnlyServicesGroupPolicy` + +This naming convention makes it clear which environment the policy applies to, +what service it pertains to, the level of access provided, and that it is a +group policy. 
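As an illustration of how a group policy following this convention might be scoped under least privilege, here is a hypothetical sketch of the policy document behind a name like `PreprodS3ReadOnlyDataBackupGroupPolicy`; the bucket ARN and the VPC id are invented placeholders, and the policy is written as a Python dictionary only so it can be serialized to JSON or templated by infrastructure-as-code tooling.

```python
import json

# Hypothetical least-privilege policy document; bucket and VPC id are placeholders.
preprod_s3_readonly_data_backup_group_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "ReadOnlyDataBackupBucket",
            "Effect": "Allow",
            # Only the actions needed for read access, no wildcards like s3:*.
            "Action": ["s3:GetObject", "s3:ListBucket"],
            "Resource": [
                "arn:aws:s3:::preprod-data-backup",
                "arn:aws:s3:::preprod-data-backup/*",
            ],
            # Condition statement further restricting access to a specific VPC.
            "Condition": {"StringEquals": {"aws:SourceVpc": "vpc-0123456789abcdef0"}},
        }
    ],
}

print(json.dumps(preprod_s3_readonly_data_backup_group_policy, indent=2))
```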
+ +### IAM Policies + +For IAM policies that apply to specific services, the name should indicate the +environment, service, and scope of access. + +- **Format**: `[Environment][Service][AccessLevel][ResourceIdentifier]Policy` +- **Example for DynamoDB**: `PreprodDynamoDBReadWriteTablePolicy` +- **Example for IAM access**: `ProdIAMFullAccessUserPolicy` +- **Example for EC2 access**: `ProdEC2ReadOnlyAirflowPolicy` + +### Example Naming Conventions + +Here’s an example naming convention for an IAM policy intended for the +development environment, with read-only access to S3 Data Buckets: + +- `DevS3ReadOnlyDataBucketsPolicy` + +##### Explanation + +- `Dev` indicates that this policy is intended for use in the development + environment. +- `S3` specifies that the policy pertains to Amazon S3 service. +- `ReadOnly` clearly states the permission level, which is read-only access. +- `DataBuckets` tells us that the policy is specifically for actions related to + data storage buckets. diff --git a/docs/infra/all.aws_scripts.reference.md b/docs/infra/all.aws_scripts.reference.md index a37a6e4d41..df943bb020 100644 --- a/docs/infra/all.aws_scripts.reference.md +++ b/docs/infra/all.aws_scripts.reference.md @@ -1,3 +1,12 @@ + + + + +- [Python Scripts Guide](#python-scripts-guide) + + + # Python Scripts Guide -Documentation of the AWS scripts available at: https://drive.google.com/drive/u/3/folders/1hAB_lhAvL69pnK2cKYG5YRHsInru0mk0 +Documentation of the AWS scripts available at: +https://drive.google.com/drive/u/3/folders/1hAB_lhAvL69pnK2cKYG5YRHsInru0mk0 diff --git a/docs/infra/all.infrastructure_glossary.explanation.md b/docs/infra/all.infrastructure_glossary.explanation.md index b574a60713..586a1df180 100644 --- a/docs/infra/all.infrastructure_glossary.explanation.md +++ b/docs/infra/all.infrastructure_glossary.explanation.md @@ -1,4 +1,4 @@ - +# Infrastructure Glossary @@ -16,6 +16,7 @@ * [Elastic Kubernetes Service (EKS)](#elastic-kubernetes-service-eks) * [Elastic File System (EFS)](#elastic-file-system-efs) * [Identity and Access Management (IAM)](#identity-and-access-management-iam) + * [CloudWatch](#cloudwatch) @@ -23,9 +24,9 @@ infrastructure - We refer to further docs for an in-depth analysis -# General technologies +## General technologies -## Kubernetes +### Kubernetes - Platform to automate deploying, scaling, and operating containers across a cluster of machines @@ -38,7 +39,7 @@ - Horizontal scaling: up and down based on commands, UI, or on resource usage - Automated rollouts and rollbacks -## Terraform +### Terraform - Tool for building, changing, and versioning infrastructure - IaC: users define and provide infrastructure in a configuration file @@ -50,7 +51,7 @@ - Show a preview of what happens when you modify infrastructure before applying the changes -## Ansible +### Ansible - Automation tool for configuration management, application deployment, and service provisioning @@ -61,7 +62,7 @@ - Support multi-node deployments - Work against multiple systems using a list (aka "inventory") -## Airflow +### Airflow - Tool to schedule and monitor workflows automatically - Workflows are set up as DAGs to reflect dependencies @@ -71,7 +72,7 @@ - Rich web UI to visualize pipelines, monitor progress, troubleshoot issues - Many plugins to manage workflows spanning many systems -## Zabbix +### Zabbix - Open-source monitoring solution for network and applications - Monitor applications metrics, processes, and performance indicators @@ -81,17 +82,17 @@ - Performance based visualization - Agent and 
agent-less monitoring -## Prometheus +### Prometheus - Monitoring and alerting toolkit -## Helm +### Helm -## eksctl +### eksctl -# Amazon Web Services (AWS) +## Amazon Web Services (AWS) -## Virtual private cloud (VPC) +### Virtual private cloud (VPC) - Provide logically isolated sections of AWS cloud - You can launch AWS resources @@ -103,7 +104,7 @@ - Create a private-facing subnet (with no Internet access) for your backend systems -## Elastic Kubernetes Service (EKS) +### Elastic Kubernetes Service (EKS) - Run K8 on AWS without installing and maintain K8 control plane or nodes - Integrates with EC2, IAM, VPC @@ -111,7 +112,7 @@ - Provide managed node groups to automate provisioning and lifecycle management of nodes (EC2) -## Elastic File System (EFS) +### Elastic File System (EFS) - Scalable, cloud-native file storage service - Fully managed service @@ -124,15 +125,17 @@ - Store data across multiple availability zones in AWS region - Different performance modes (general purpose or max I/O) -## Identity and Access Management (IAM) +### Identity and Access Management (IAM) - Create users and groups permissions to access and deny access to AWS resources - Permanent or temporary credentials (e.g., expire after a certain duration) - Multi-factor authentication (besides username and password) - Can test and validate effects of IAM policies changes before applying them -## CloudWatch -- See https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html +### CloudWatch + +- See + https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/WhatIsCloudWatch.html - CloudWatch monitors your AWS resources and the applications you run on AWS in real-time diff --git a/docs/infra/all.infrastructure_update_rollout.how_to_guide.md b/docs/infra/all.infrastructure_update_rollout.how_to_guide.md index b423e4ca0f..75edb6d23a 100644 --- a/docs/infra/all.infrastructure_update_rollout.how_to_guide.md +++ b/docs/infra/all.infrastructure_update_rollout.how_to_guide.md @@ -28,23 +28,19 @@ consistently, and safely. ### Phase 1: Planning - [ ] **Issue Creation**: - - File a GitHub issue detailing the purpose of the update, specific versions we plan to update, and the specific benefits we aim to achieve. - [ ] **Identify Update Requirements**: - - Determine which components need updates and the reasons (security updates, feature enhancements, bug fixes). - [ ] **Review Update Documentation**: - - Study the release notes and documentation of the updates to understand the changes, new features, new dependencies, breaking changes, and potential impacts. - [ ] **Stakeholder Communication**: - - Inform relevant stakeholders about the planned update and gather any initial feedback or concerns. @@ -55,53 +51,43 @@ consistently, and safely. ### Phase 2: Testing - [ ] **Environment Setup**: - - Set up testing environments that closely mirror production systems. - [ ] **Apply Updates**: - - Deploy the updates in the testing environment. - [ ] **Automated Testing**: - - Run automated regression and new feature tests to ensure nothing breaks with the new updates. - [ ] **Manual Testing**: - - Conduct thorough manual testing to check for issues not covered by automated tests. - [ ] **Performance Benchmarking**: - - Compare performance metrics before and after the update to ensure no degradation. - [ ] **Security Assessment**: - - Perform security audits on the updated components to ensure no new vulnerabilities are introduced. 
### Phase 3: Pre-Deployment - [ ] **Final Review Meeting**: - - Conduct a meeting with relevant stakeholders to review and decide whether to proceed with the deployment to production. - [ ] **Backup Production Data**: - - Ensure that all relevant production data is backed up and restore points are created. - [ ] **Rollback Plan**: - - Prepare detailed rollback procedures in case the update needs to be reversed. (Utilize Kubernetes’ capabilities for a seamless rollback, which can be executed with minimal to no downtime.) - [ ] **Stakeholder Announcement**: - - Ensure all stakeholders are informed about the upcoming update and its implications. Issue a notification via Telegram and email detailing the scope of the update, deployment start time, and expected downtime duration @@ -111,34 +97,28 @@ consistently, and safely. ### Phase 4: Deployment - [ ] **Phased Rollout**: - - If applicable, roll out the update incrementally (canary release, blue-green deployments). - [ ] **Monitoring**: - - Closely monitor the system for any immediate issues during and after the deployment. - [ ] **Stakeholder Update**: - - Keep stakeholders updated on the deployment status and any critical issues. ### Phase 5: Post-Deployment - [ ] **Post-Deployment Testing and Monitoring**: - - Conduct additional testing to ensure the system operates as expected in the production environment. Closely monitor the system after deployment for any unforeseen issues. - [ ] **Performance Monitoring**: - - Monitor system performance over time to catch any delayed effects of the update. - [ ] **Issue Log**: - - Document any issues encountered during deployment and how they were resolved. diff --git a/docs/kaizenflow/all.dev_scripts_catalogue.reference.md b/docs/kaizenflow/all.dev_scripts_catalogue.reference.md index 8a4b1dccbf..bf6cadd3e8 100644 --- a/docs/kaizenflow/all.dev_scripts_catalogue.reference.md +++ b/docs/kaizenflow/all.dev_scripts_catalogue.reference.md @@ -1,120 +1,298 @@ -# `bash` -## kga -## mkbak -## path -## print_paths.sh -## timestamp -## tree.sh -# Development + -## create_class_diagram.sh -## ctags.sh -## call_graph.sh -## go_amp.sh -## setenv_amp.configure_env.sh -## setenv_amp.sh -## tmux_amp.sh -## tmux_kill_session.sh +- [Dev Scripts Catalogue](#dev-scripts-catalogue) + * [`bash`](#bash) + + [kga](#kga) + + [mkbak](#mkbak) + + [path](#path) + + [print_paths.sh](#print_pathssh) + + [timestamp](#timestamp) + + [tree.sh](#treesh) + * [Development](#development) + + [create_class_diagram.sh](#create_class_diagramsh) + + [ctags.sh](#ctagssh) + + [call_graph.sh](#call_graphsh) + + [go_amp.sh](#go_ampsh) + + [setenv_amp.configure_env.sh](#setenv_ampconfigure_envsh) + + [setenv_amp.sh](#setenv_ampsh) + + [tmux_amp.sh](#tmux_ampsh) + + [tmux_kill_session.sh](#tmux_kill_sessionsh) + * [Repo integration](#repo-integration) + + [clean_up_text_files.sh](#clean_up_text_filessh) + * [Searching](#searching) + + [ack](#ack) + + [ffind.py](#ffindpy) + + [jack](#jack) + + [jackipynb](#jackipynb) + + [jackmd](#jackmd) + + [jackmk](#jackmk) + + [jackppy](#jackppy) + + [jackpy](#jackpy) + + [jackpyc](#jackpyc) + + [jacktxt](#jacktxt) + * [Markdown](#markdown) + + [lint_md.sh](#lint_mdsh) + + [convert_gdoc_to_markdown.sh](#convert_gdoc_to_markdownsh) + + [remove_empty_lines.sh](#remove_empty_linessh) + * [`vim`](#vim) + + [traceback_to_cfile.py](#traceback_to_cfilepy) + + [viack](#viack) + + [vic](#vic) + + [vigit](#vigit) + + [vigitp](#vigitp) + + [vil](#vil) + + [vit](#vit) + + [viw](#viw) + * [To reorg](#to-reorg) + + 
[aws](#aws) + + [cie](#cie) + + [cleanup_scripts](#cleanup_scripts) + + [client_setup](#client_setup) + + [code_stats.sh](#code_statssh) + + [compile_all.py](#compile_allpy) + + [compress_files.sh](#compress_filessh) + + [cvxpy_setup](#cvxpy_setup) + + [diff_to_vimdiff.py](#diff_to_vimdiffpy) + + [docker_clean_all.sh](#docker_clean_allsh) + + [docker_clean_postgres.sh](#docker_clean_postgressh) + + [email_notify.py](#email_notifypy) + + [export_vars.sh](#export_varssh) + + [find_unused_golden_files.py](#find_unused_golden_filespy) + + [fix_perms.sh](#fix_permssh) + + [git](#git) + + [github](#github) + + [grsync.py](#grsyncpy) + + [helpers.sh](#helperssh) + + [infra](#infra) + + [integrate_repos](#integrate_repos) + + [lib_tasks_data_qa.py](#lib_tasks_data_qapy) + + [lib_tasks_data_reconcile.py](#lib_tasks_data_reconcilepy) + + [lint_solidity.sh](#lint_soliditysh) + + [manage_cache.py](#manage_cachepy) + + [measure_import_times.py](#measure_import_timespy) + + [mk_targets](#mk_targets) + + [notebooks](#notebooks) + + [old](#old) + + [parallel_script_skeleton.py](#parallel_script_skeletonpy) + + [poetry](#poetry) + + [process_prof.py](#process_profpy) + + [release_encrypted_model.sh](#release_encrypted_modelsh) + + [release_sorrentum](#release_sorrentum) + + [remove_escape_chars.py](#remove_escape_charspy) + + [remove_jupyter_metadata.sh](#remove_jupyter_metadatash) + + [remove_redundant_paths.sh](#remove_redundant_pathssh) + + [replace_text.py](#replace_textpy) + + [run_profiling.sh](#run_profilingsh) + + [save_screenshot.sh](#save_screenshotsh) + + [script_skeleton.py](#script_skeletonpy) + + [string_to_file.py](#string_to_filepy) + + [sync_repo.sh](#sync_reposh) + + [test](#test) + + [testing](#testing) + + [tg.py](#tgpy) + + [to_clean/](#to_clean) + + [toml_merge.py](#toml_mergepy) + + [transform_skeleton.py](#transform_skeletonpy) + + [url.py](#urlpy) + + [zip_files.py](#zip_filespy) -# Repo integration + -## clean_up_text_files.sh +# Dev Scripts Catalogue -# Searching +## `bash` -## ack +### kga -## ffind.py +### mkbak -## jack -## jackipynb -## jackmd -## jackmk -## jackppy -## jackpy -## jackpyc -## jacktxt +### path -# Markdown +### print_paths.sh + +### timestamp + +### tree.sh + +## Development + +### create_class_diagram.sh + +### ctags.sh + +### call_graph.sh + +### go_amp.sh + +### setenv_amp.configure_env.sh + +### setenv_amp.sh + +### tmux_amp.sh + +### tmux_kill_session.sh + +## Repo integration + +### clean_up_text_files.sh + +## Searching + +### ack + +### ffind.py + +### jack + +### jackipynb + +### jackmd + +### jackmk + +### jackppy + +### jackpy + +### jackpyc + +### jacktxt + +## Markdown + +### lint_md.sh -## lint_md.sh - Lint a markdown file -## convert_gdoc_to_markdown.sh +### convert_gdoc_to_markdown.sh + - Convert a Google Doc in docx format to markdown removing artifacts -## remove_empty_lines.sh +### remove_empty_lines.sh + - Used in vim to remove empty spaces -- - -# `vim` - -## traceback_to_cfile.py - -## viack -## vic -## vigit -## vigitp -## vil -## vit -## viw - -# To reorg - -## aws -## cie -## cleanup_scripts -## client_setup -## code_stats.sh -## compile_all.py -## compress_files.sh -## cvxpy_setup -## diff_to_vimdiff.py -## docker_clean_all.sh -## docker_clean_postgres.sh -## email_notify.py -## export_vars.sh -## find_unused_golden_files.py -## fix_perms.sh -## git -## github -## grsync.py -## helpers.sh -## infra -## integrate_repos - -## lib_tasks_data_qa.py -## lib_tasks_data_reconcile.py -## lint_solidity.sh -## manage_cache.py -## 
measure_import_times.py -## mk_targets -## notebooks -## old -## parallel_script_skeleton.py -## poetry -## process_prof.py -## release_encrypted_model.sh -## release_sorrentum -## remove_escape_chars.py -## remove_jupyter_metadata.sh -## remove_redundant_paths.sh -## replace_text.py -## run_profiling.sh -## save_screenshot.sh -## script_skeleton.py -## string_to_file.py -## sync_repo.sh -## test -## testing -## tg.py - -## to_clean/ - -## toml_merge.py -## transform_skeleton.py -## url.py - -## zip_files.py +- + +## `vim` + +### traceback_to_cfile.py + +### viack + +### vic + +### vigit + +### vigitp + +### vil + +### vit + +### viw + +## To reorg + +### aws + +### cie + +### cleanup_scripts + +### client_setup + +### code_stats.sh + +### compile_all.py + +### compress_files.sh + +### cvxpy_setup + +### diff_to_vimdiff.py + +### docker_clean_all.sh + +### docker_clean_postgres.sh + +### email_notify.py + +### export_vars.sh + +### find_unused_golden_files.py + +### fix_perms.sh + +### git + +### github + +### grsync.py + +### helpers.sh + +### infra + +### integrate_repos + +### lib_tasks_data_qa.py + +### lib_tasks_data_reconcile.py + +### lint_solidity.sh + +### manage_cache.py + +### measure_import_times.py + +### mk_targets + +### notebooks + +### old + +### parallel_script_skeleton.py + +### poetry + +### process_prof.py + +### release_encrypted_model.sh + +### release_sorrentum + +### remove_escape_chars.py + +### remove_jupyter_metadata.sh + +### remove_redundant_paths.sh + +### replace_text.py + +### run_profiling.sh + +### save_screenshot.sh + +### script_skeleton.py + +### string_to_file.py + +### sync_repo.sh + +### test + +### testing + +### tg.py + +### to_clean/ + +### toml_merge.py + +### transform_skeleton.py + +### url.py + +### zip_files.py diff --git a/docs/kaizenflow/all.install_helpers.how_to_guide.md b/docs/kaizenflow/all.install_helpers.how_to_guide.md index bec5d81142..c8255cbb6b 100644 --- a/docs/kaizenflow/all.install_helpers.how_to_guide.md +++ b/docs/kaizenflow/all.install_helpers.how_to_guide.md @@ -1,4 +1,4 @@ - +# Install Helpers @@ -17,7 +17,7 @@ -# Helpers Distribution Package +## Helpers Distribution Package - This document describes how to build, distribute and install the `helpers` package @@ -25,9 +25,9 @@ - Note for dev/data science members: if you are looking for how to install packages for your daily work, go to **Client Configuration**. -# Creating and installing the package +## Creating and installing the package -## PyPI local file +### PyPI local file - You can create the `helpers` package with: @@ -44,7 +44,7 @@ > python -m pip install dist/helpers-1.0.0-py3-none-any.whl ``` -## PyPI workflow +### PyPI workflow - This section describes a temporary solution while we build the CI pipeline. @@ -69,9 +69,9 @@ 4. Run `python setup.py sdist upload -r part` -# PyPI server installation +## PyPI server installation -## General Information +### General Information - We use [pypiserver](https://github.com/pypiserver/pypiserver) as a corporate PyPI Index server for installing `pip` @@ -82,7 +82,7 @@ - Wheels, bdists, eggs can be uploaded either with `pip`, `setuptools` or simply copied with `scp` to the server directory -## Client Configuration / Installation +### Client Configuration / Installation You have two options: @@ -125,7 +125,7 @@ You have two options: - **Note** that pip search does not currently work with the /simple/ endpoint. 
-## Server Details +### Server Details **Simple Index WebUI**: http://172.31.36.23:8855/simple @@ -133,7 +133,7 @@ You have two options: **Runtime**: by docker (standalone container) -## Server Configuration +### Server Configuration - The corporate PyPI Index server runs with Docker as a standalone container with mapped volumes on the host. @@ -152,23 +152,22 @@ You have two options: > docker run -d -p 8855:8080 -v ~/pypi/packages:/data/packages -v ~/pypi/.htpasswd:/data/.htpasswd --restart=always pypiserver/pypiserver:latest -v -P .htpasswd packages ``` -## Limitation +### Limitation - The `pypiserver` does not implement the full API as seen on [PyPI](https://pypi.org/). It implements just enough to make `pip install`, and `search` work. -## Links +### Links - [pip user guide](https://pip.pypa.io/en/stable/user_guide/#user-guide) - [pypiserver](https://github.com/pypiserver/pypiserver) - [setuptool](https://setuptools.readthedocs.io/en/latest/index.html) - [packaging python projects](https://packaging.python.org/tutorials/packaging-projects/) -# Code organization in `helpers` +## Code organization in `helpers` - In `helpers` the following hierarchy should be respected: - - `repo_config.py` - `hwarnings`, `hserver`, `hlogging` - `hdbg` diff --git a/docs/kaizenflow/all.run_Mock2_in_batch_mode.how_to_guide.md b/docs/kaizenflow/all.run_Mock2_in_batch_mode.how_to_guide.md index 4e7b1fa5a4..d965797f1a 100644 --- a/docs/kaizenflow/all.run_Mock2_in_batch_mode.how_to_guide.md +++ b/docs/kaizenflow/all.run_Mock2_in_batch_mode.how_to_guide.md @@ -1,4 +1,4 @@ - +# Run Mock2 In Batch Mode @@ -14,7 +14,7 @@ The goal is to run a simple system (Mock2) end-to-end in batch mode and compute PnL. This is the typical flow that Quants run to estimate performance of a model. -# Description of the forecast system +## Description of the forecast system - A notebook running the forecast system interactively is [/docs/kaizenflow/all.run_Mock2_pipeline_in_notebook.how_to_guide.ipynb](/docs/kaizenflow/all.run_Mock2_pipeline_in_notebook.how_to_guide.ipynb) @@ -35,7 +35,7 @@ model. - Finally the entire `DAG` is run -# Description of the System +## Description of the System - The same `System` can be built using various utilities from dataflow/system/system.py @@ -49,7 +49,7 @@ model. - Concrete fully-configured `System`s are built in [/dataflow_amp/system/mock2/mock2_forecast_system_example.py](/dataflow_amp/system/mock2/mock2_forecast_system_example.py) -# Run a backtest +## Run a backtest Pull the latest `master` ``` @@ -66,7 +66,7 @@ Pull the latest `master` - The script runs a backtest for a simple dummy "strategy" using equities data for 1 month (2023-08) and 1 asset (MSFT). Trading frequency is 5 minutes. -## Explanation of the backtesting script +### Explanation of the backtesting script - Inside `docker_bash` @@ -96,7 +96,7 @@ Pull the latest `master` [/docs/dataflow/ck.run_backtest.how_to_guide.md](/docs/dataflow/ck.run_backtest.how_to_guide.md) for more details -# Analyze the results +## Analyze the results - ``` > i docker_jupyter diff --git a/docs/kaizenflow/all.run_end_to_end_Mock2_system.tutorial.md b/docs/kaizenflow/all.run_end_to_end_Mock2_system.tutorial.md index bd378386b5..a0d31f9dc5 100644 --- a/docs/kaizenflow/all.run_end_to_end_Mock2_system.tutorial.md +++ b/docs/kaizenflow/all.run_end_to_end_Mock2_system.tutorial.md @@ -1,4 +1,4 @@ - +# Run End To End Mock2 System @@ -16,7 +16,7 @@ -# Overview +## Overview The goal is to run a `System` with `Portfolio` in the replayed time mode for a few bars. 
@@ -39,7 +39,7 @@ flowchart LR DataFrameBroker --> Trades ``` -# High-level architecture and code organization +## High-level architecture and code organization - `dataflow_amp/pipelines/mock2/mock2_pipeline.py` - Builder that creates the prediction model @@ -73,7 +73,7 @@ flowchart LR - `dataflow_amp/system/mock2/scripts/run_end_to_end_Mock2_system.py` - Run an end-to-end streaming simulation -# System Configuration +## System Configuration `System` parameters are controlled via `SystemConfig`, which is built `dataflow_amp/system/mock2/scripts/run_end_to_end_Mock2_system.py`. @@ -82,10 +82,10 @@ The snippet of code below configures the input data, e.g., bar duration, history amount, number of assets ```python -# Bar duration in seconds, e.g., 60 * 60 is 1 hour bar. +## Bar duration in seconds, e.g., 60 * 60 is 1 hour bar. system.config["bar_duration_in_seconds"] = 60 * 60 system.config["market_data_config", "number_of_assets"] = 10 -# History amount, e.g., 10 days worth of data. +## History amount, e.g., 10 days worth of data. system.config["market_data_config", "history_lookback"] = pd.Timedelta( days=10 ) @@ -258,9 +258,9 @@ dag_runner_config (marked_as_used=False, writer=None, val_type=core.config.confi event_loop_object (marked_as_used=True, writer=/app/amp/dataflow/system/system_builder_utils.py::711::get_DataFramePortfolio_from_System, val_type=helpers.hasyncio._EventLoop): <_EventLoop running=False closed=False debug=False> ``` -# System components +## System components -## MarketData +### MarketData The source of data is `ReplayedMarketData`, an object that can replay a synthetic or previously capture dataframe. The data is represented by random @@ -272,10 +272,12 @@ Data snippet: ```markdown start_datetime timestamp_db open high low close volume asset_id -end_datetime -2023-08-08 15:00:00-04:00 2023-08-08 14:00:00-04:00 2023-08-08 15:00:10-04:00 985.34 986.36 984.87 986.25 935.0 0 -2023-08-08 15:00:00-04:00 2023-08-08 14:00:00-04:00 2023-08-08 15:00:10-04:00 1005.38 1006.65 1005.07 1006.34 948.0 1 -2023-08-08 15:00:00-04:00 2023-08-08 14:00:00-04:00 2023-08-08 15:00:10-04:00 1002.44 1002.49 1001.60 1001.88 1013.0 2 + +end_datetime 2023-08-08 15:00:00-04:00 2023-08-08 14:00:00-04:00 2023-08-08 +15:00:10-04:00 985.34 986.36 984.87 986.25 935.0 0 2023-08-08 15:00:00-04:00 +2023-08-08 14:00:00-04:00 2023-08-08 15:00:10-04:00 1005.38 1006.65 1005.07 +1006.34 948.0 1 2023-08-08 15:00:00-04:00 2023-08-08 14:00:00-04:00 2023-08-08 +15:00:10-04:00 1002.44 1002.49 1001.60 1001.88 1013.0 2 ``` We also control via `SystemConfig`: @@ -286,7 +288,7 @@ We also control via `SystemConfig`: - Delay in seconds, i.e. the `System` simulates delay and waits for data to become available for `X` seconds -## DagBuilder +### DagBuilder A `DagBuilder` configures `Nodes` and connects them into a `Dag` in order to generate forecasts. The `DagBuilder` used in this example is the toy model @@ -303,7 +305,7 @@ generate forecasts. The `DagBuilder` used in this example is the toy model - Adjusts returns using volatility - Clips returns -## DAG +### DAG `Dag` is represented by: @@ -315,7 +317,7 @@ generate forecasts. The `DagBuilder` used in this example is the toy model One can configure portfolio construction (e.g, maximum portfolio notional, i.e. `target_gmv`) via `SystemConfig`. -## DagRunner +### DagRunner `DagRunner` is represented by `RealTimeDagRunner` which is an executor that controls how to run the `System` in streaming mode (both real-time and @@ -324,7 +326,7 @@ simulated). 
One can configure any parameter (e.g., for how long to run the `System`, e.g., for 2 bars) via `SystemConfig`. -## Portfolio +### Portfolio `Portfolio` is implemented by `DataFramePortfolio` with a `DataFrameBroker` which: @@ -333,7 +335,7 @@ which: - Has no advanced mechanism to control trade execution, i.e. all orders always are fully filled -# System run +## System run To run the `System` and save logs execute the following cmd: ``` @@ -344,29 +346,29 @@ The System starts at `2023-08-15 11:00:00-04:00` and computes the DAG for 2 bars. ```markdown -# Real-time loop: num_it=1: rt_time_out_in_secs=7200 wall_clock_time='2023-08-15 11:00:00-04:00' real_wall_clock_time='2024-01-05 08:39:32.981505-05:00' +## Real-time loop: num_it=1: rt_time_out_in_secs=7200 wall_clock_time='2023-08-15 11:00:00-04:00' real_wall_clock_time='2024-01-05 08:39:32.981505-05:00' ``` It waits for the data to become available for 10 seconds (configurable): ```markdown -### waiting on last bar: num_iter=10/120: current_bar_timestamp=2023-08-15 11:00:00-04:00 wall_clock_time=2023-08-15 11:00:10-04:00 last_db_end_time=2023-08-15 11:00:00-04:00 +#### waiting on last bar: num_iter=10/120: current_bar_timestamp=2023-08-15 11:00:00-04:00 wall_clock_time=2023-08-15 11:00:10-04:00 last_db_end_time=2023-08-15 11:00:00-04:00 08:39:33 rss=0.292GB vms=1.261GB mem_pct=1% Task-3 hprint.py log_frame:604 -# Waiting on last bar: done +## Waiting on last bar: done ``` And once the data is ready the `System` computes the `Dag`: ```markdown -################################################################################ +################################################################################# Executing method 'predict' for node topological_id=0 nid='read_data' ... -################################################################################ +################################################################################# ... -################################################################################ +################################################################################# Executing method 'predict' for node topological_id=8 nid='process_forecasts' ... -################################################################################ +################################################################################# ``` When executing the `ProcessForecastsNode` the System: @@ -376,42 +378,80 @@ When executing the `ProcessForecastsNode` the System: - Submits orders (in this case all orders are fully filled) ```markdown -# last target positions= +## last target positions= + holdings_shares price holdings_notional wall_clock_timestamp prediction volatility spread target_holdings_notional target_trades_notional target_trades_shares target_holdings_shares -asset_id -0 0 983.66 0 2023-08-15 11:00:11-04:00 0.9674 0.000967 0 586609.98842 586609.98842 596.35442 596.35442 -1 0 1010.59 0 2023-08-15 11:00:11-04:00 0.772048 0.001099 0 454185.047454 454185.047454 449.42563 449.42563 -2 0 1005.54 0 2023-08-15 11:00:11-04:00 -0.607692 0.000675 0 -1202955.27038 -1202955.27038 -1196.327615 -1196.327615 -... -7 0 1014.55 0 2023-08-15 11:00:11-04:00 -0.632565 0.000568 0 -1699546.60519 -1699546.60519 -1675.17284 -1675.17284 -8 0 996.88 0 2023-08-15 11:00:11-04:00 -1.287815 0.000904 0 -670108.728394 -670108.728394 -672.206011 -672.206011 -9 0 993.89 0 2023-08-15 11:00:11-04:00 -0.02503 0.00066 0 1259035.977478 1259035.977478 1266.775979 1266.775979 -... 
-# last orders= -Order: order_id=0 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=0 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=596.354419637 tz=America/New_York extra_params={} -Order: order_id=1 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=1 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=449.425630032 tz=America/New_York extra_params={} -Order: order_id=2 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=2 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=-1196.327615391 tz=America/New_York extra_params={} -Order: order_id=3 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=3 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=-686.685193608 tz=America/New_York extra_params={} -Order: order_id=4 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=4 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=386.275392292 tz=America/New_York extra_params={} -Order: order_id=5 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=5 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=1960.903428683 tz=America/New_York extra_params={} -Order: order_id=6 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=6 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=-1086.072750644 tz=America/New_York extra_params={} -Order: order_id=7 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=7 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=-1675.172840363 tz=America/New_York extra_params={} -Order: order_id=8 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=8 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=-672.206011149 tz=America/New_York extra_params={} -Order: order_id=9 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=9 type_=price@twap start_timestamp=2023-08-15 11:00:11-04:00 end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 diff_num_shares=1266.775978708 tz=America/New_York extra_params={} + +asset_id 0 0 983.66 0 2023-08-15 11:00:11-04:00 0.9674 0.000967 0 586609.98842 +586609.98842 596.35442 596.35442 1 0 1010.59 0 2023-08-15 11:00:11-04:00 +0.772048 0.001099 0 454185.047454 454185.047454 449.42563 449.42563 2 0 1005.54 +0 2023-08-15 11:00:11-04:00 -0.607692 0.000675 0 -1202955.27038 -1202955.27038 +-1196.327615 -1196.327615 ... 7 0 1014.55 0 2023-08-15 11:00:11-04:00 -0.632565 +0.000568 0 -1699546.60519 -1699546.60519 -1675.17284 -1675.17284 8 0 996.88 0 +2023-08-15 11:00:11-04:00 -1.287815 0.000904 0 -670108.728394 -670108.728394 +-672.206011 -672.206011 9 0 993.89 0 2023-08-15 11:00:11-04:00 -0.02503 0.00066 +0 1259035.977478 1259035.977478 1266.775979 1266.775979 ... 
+ +## last orders= + +Order: order*id=0 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=0 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=596.354419637 tz=America/New_York extra_params={} Order: +order_id=1 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=1 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=449.425630032 tz=America/New_York extra_params={} Order: +order_id=2 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=2 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=-1196.327615391 tz=America/New_York extra_params={} Order: +order_id=3 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=3 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=-686.685193608 tz=America/New_York extra_params={} Order: +order_id=4 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=4 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=386.275392292 tz=America/New_York extra_params={} Order: +order_id=5 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=5 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=1960.903428683 tz=America/New_York extra_params={} Order: +order_id=6 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=6 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=-1086.072750644 tz=America/New_York extra_params={} Order: +order_id=7 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=7 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=-1675.172840363 tz=America/New_York extra_params={} Order: +order_id=8 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=8 +type*=price@twap start*timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=-672.206011149 tz=America/New_York extra_params={} Order: +order_id=9 creation_timestamp=2023-08-15 11:00:11-04:00 asset_id=9 +type*=price@twap start_timestamp=2023-08-15 11:00:11-04:00 +end_timestamp=2023-08-15 12:00:00-04:00 curr_num_shares=0.0 +diff_num_shares=1266.775978708 tz=America/New_York extra_params={} ``` Then the `System` goes to sleep waiting for the next bar to start: + ```markdown -08:39:37 rss=0.296GB vms=1.325GB mem_pct=1%% Task-3 process_forecasts_.py process_forecasts:353 Event: exiting process_forecasts() for loop. -08:39:37 rss=0.296GB vms=1.325GB mem_pct=1% Task-3 real_time_dag_runner.py _run_dag:264 Waiting on node 'process_forecasts': done -08:39:37 rss=0.296GB vms=1.325GB mem_pct=1% Task-1 real_time.py execute_with_real_time_loop:422 await done (wall_clock_time=2023-08-15 12:00:00-04:00) +08:39:37 rss=0.296GB vms=1.325GB mem*pct=1%% Task-3 process_forecasts*.py +process_forecasts:353 Event: exiting process_forecasts() for loop. 
08:39:37 +rss=0.296GB vms=1.325GB mem_pct=1% Task-3 real_time_dag_runner.py \_run_dag:264 +Waiting on node 'process_forecasts': done 08:39:37 rss=0.296GB vms=1.325GB +mem_pct=1% Task-1 real_time.py execute_with_real_time_loop:422 await done +(wall_clock_time=2023-08-15 12:00:00-04:00) ``` Since the clock is simulated, instead of waiting 1 hour, the `System` moves the clock forward within a few seconds and starts to compute the 2nd bar: ```markdown -# Real-time loop: num_it=2: rt_time_out_in_secs=7200 wall_clock_time='2023-08-15 12:00:00-04:00' real_wall_clock_time='2024-01-05 08:39:37.416849-05:00' +## Real-time loop: num_it=2: rt_time_out_in_secs=7200 wall_clock_time='2023-08-15 12:00:00-04:00' real_wall_clock_time='2024-01-05 08:39:37.416849-05:00' ``` Then the `System` repeats the `Dag` computation and order submission but for the @@ -419,9 +459,13 @@ next bar and exits once the termination condition becomes True (run for 2 bars in this case): ```markdown -08:39:42 rss=0.297GB vms=1.326GB mem_pct=1% Task-1 hwall_clock_time.py set_current_bar_timestamp:105 timestamp=2023-08-15 13:00:00-04:00 -08:39:42 rss=0.297GB vms=1.326GB mem_pct=1%% Task-1 real_time.py execute_with_real_time_loop:433 rt_timeout_in_secs_or_time=7200, bar_duration_in_secs=3600, num_it=2, num_iterations=2, is_done=True -08:39:42 rss=0.297GB vms=1.326GB mem_pct=1% - ^[[36mINFO ^[[0m Task-1 real_time.py execute_with_real_time_loop:443 Exiting loop: num_it=2, num_iterations=2 +08:39:42 rss=0.297GB vms=1.326GB mem_pct=1% Task-1 hwall_clock_time.py +set_current_bar_timestamp:105 timestamp=2023-08-15 13:00:00-04:00 08:39:42 +rss=0.297GB vms=1.326GB mem_pct=1%% Task-1 real_time.py +execute_with_real_time_loop:433 rt_timeout_in_secs_or_time=7200, +bar_duration_in_secs=3600, num_it=2, num_iterations=2, is_done=True 08:39:42 +rss=0.297GB vms=1.326GB mem_pct=1% - ^[[36mINFO ^[[0m Task-1 real_time.py +execute_with_real_time_loop:443 Exiting loop: num_it=2, num_iterations=2 ``` The output is in the dir: @@ -460,4 +504,4 @@ system_log_dir/ 10 directories, 18 files ``` -# TODO(gp): @all Describe the output +## TODO(gp): @all Describe the output diff --git a/docs/mkdocs/assets/favicon.ico b/docs/mkdocs/assets/favicon.ico new file mode 100644 index 0000000000..5cc94f975c Binary files /dev/null and b/docs/mkdocs/assets/favicon.ico differ diff --git a/docs/mkdocs/assets/logo.png b/docs/mkdocs/assets/logo.png new file mode 100644 index 0000000000..9d7b16fe40 Binary files /dev/null and b/docs/mkdocs/assets/logo.png differ diff --git a/docs/mkdocs/styles/styles.css b/docs/mkdocs/styles/styles.css new file mode 100644 index 0000000000..b0fd0dad8d --- /dev/null +++ b/docs/mkdocs/styles/styles.css @@ -0,0 +1,9 @@ +.md-logo img { + height: auto; + width: 7.2rem; +} + +.md-header__button.md-logo { + margin: .4rem; + padding: .6rem; +} \ No newline at end of file diff --git a/docs/onboarding/all.communicate_in_telegram.how_to_guide.md b/docs/onboarding/all.communicate_in_telegram.how_to_guide.md index 7b624ad03f..6cdf1ca68d 100644 --- a/docs/onboarding/all.communicate_in_telegram.how_to_guide.md +++ b/docs/onboarding/all.communicate_in_telegram.how_to_guide.md @@ -1,4 +1,6 @@ -# Telegram +# Communicate In Telegram + +## Telegram @@ -11,18 +13,18 @@ -# General +## General - We use [Telegram](https://telegram.org/) for - Discussions that need - - tight interaction (like a debug session) - - immediacy (e.g., "are you ready for the sync up?") + - Tight interaction (like a debug session) + - Immediacy (e.g., "are you ready for the sync up?") - Github 
Actions notifications from Telegram bots - E.g., regressions fail in one of our repos -# Secret vs Regular chats +## Secret vs Regular chats -## Secret +### Secret - We use secret chats for private one-on-one communication - We prefer to send all the sensitive information using encrypted chats @@ -39,20 +41,20 @@ Telegram cloud. This means you can only access messages in a secret chat from their device of origin. -## Regular +### Regular - We use regular chats for - General discussions within a team - Group chats - We do not share sensitive information via regular chats -# Username +## Username - We ask everyone to set a username so that is easier to find a person - See the instructions [here](https://telegram.org/faq#q-what-are-usernames-how-do-i-get-one) -# Google meet room +## Google meet room - It is always nice to pin a google meeting room in a chat - We usually use `->` as an invitation to join a google meet room pinned in a diff --git a/docs/onboarding/all.development_documents.reference.md b/docs/onboarding/all.development_documents.reference.md index 2478afbd31..ed2bacc811 100644 --- a/docs/onboarding/all.development_documents.reference.md +++ b/docs/onboarding/all.development_documents.reference.md @@ -1,3 +1,4 @@ +# Development Documents @@ -12,11 +13,11 @@ image -# On-boarding +## On-boarding - [Signing up for KaizenFlow](/docs/onboarding/kaizenflow.signing_up.how_to_guide.md) -# How to start developing +## How to start developing This contains the absolute minimal amount of info to start developing @@ -34,7 +35,7 @@ This contains the absolute minimal amount of info to start developing - [Unit tests](/docs/coding/all.write_unit_tests.how_to_guide.md) -# Project management +## Project management - [Contributor Scoring](/docs/work_organization/all.contributor_scoring.how_to_guide.md) - How we give feedback to contributors @@ -62,7 +63,7 @@ This contains the absolute minimal amount of info to start developing - [How to integrate repos](/docs/coding/all.integrate_repos.how_to_guide.md) - Detailed instruction on how to integrate repos -# Learn how to become efficient at developing +## Learn how to become efficient at developing This contains a set of resources that over time will make 10x more productive @@ -76,7 +77,7 @@ This contains a set of resources that over time will make 10x more productive - [Scrum Methodology](/docs/work_organization/all.scrum.explanation.md) -# In-depth docs +## In-depth docs - [Code organization](/docs/all.code_organization.reference.md) @@ -98,7 +99,7 @@ This contains a set of resources that over time will make 10x more productive - [PyCharm](/docs/work_tools/all.pycharm.how_to_guide.md) -# DeFi +## DeFi - [DeFi README](/defi/README.md) diff --git a/docs/onboarding/all.onboarding_checklist.md b/docs/onboarding/all.onboarding_checklist.md index 4d3d35d172..308727da23 100644 --- a/docs/onboarding/all.onboarding_checklist.md +++ b/docs/onboarding/all.onboarding_checklist.md @@ -1,4 +1,4 @@ - +# Onboarding Checklist @@ -14,21 +14,22 @@ * [The first day!](#the-first-day) + [Team member info](#team-member-info) + [NDA](#nda) + + [Hubstaff](#hubstaff) + [IT setup](#it-setup) -- [Checklists to verify that everything works](#checklists-to-verify-that-everything-works) +- [Quick checklists to verify that everything works](#quick-checklists-to-verify-that-everything-works) * [The second day](#the-second-day) -# Onboarding process for a new team member +## Onboarding process for a new team member -## Meta +### Meta -### Make on-boarding automatic +#### Make 
on-boarding automatic - We want to make the onboarding process as automatic as possible -### Be patient +#### Be patient - Let's use all the communication tools we have (screen sharing, Google Meet, phone, Telegram, email) to keep the process smooth and personal @@ -36,7 +37,7 @@ inevitably go wrong - Let's all be patient with each other -### Ask for confirmation +#### Ask for confirmation - Use checklists any time to make sure things get done - Ask for confirmation of all the actions, e.g., @@ -46,7 +47,7 @@ - Make the new team member follow the instructions so that they can get familiar with the systems -### Make on-boarding similar to our work routine +#### Make on-boarding similar to our work routine - Provide tools for interacting with the team - During the process the new teammate will ask questions and he / she should @@ -63,7 +64,7 @@ - He / she should put his / her notes in the GH issue while proceeding during the onboarding process -### Improve on-boarding process +#### Improve on-boarding process - Review, improve, clarify process - We want to point the new team member to a document rather than explaining @@ -78,7 +79,7 @@ - Open the first GitHub bug on the first day - Open the first PR by the second day -## People involved in the on-boarding +### People involved in the on-boarding - Each task is performed by one of the persons involved in the on-boarding - Team leader (e.g., GP, Paul, Grisha, Samarth) @@ -86,7 +87,7 @@ - IT (e.g., Shayan) - HR (e.g., Rose) -## Before the start date +### Before the start date - [ ] **Team leader**: talk to the teams about the new team member joining @@ -102,9 +103,9 @@ - What is going to work on initially - LinkedIn link -## The first day! +### The first day! -### Team member info +#### Team member info - [ ] **Team member**: send needed information to your team leader - Full name: @@ -117,9 +118,9 @@ - User's SSH public key - [ ] **Team leader**: update the - [Team member info gdoc](https://docs.google.com/document/d/1gmcmLzaopYWEycx1AbBMupG9hJhcxPCxN85l23_fZGk) + [Team member info gdoc](https://docs.google.com/document/d/1gmcmLzaopYWEycx1AbBMupG9hJhcxPCxN85l23_fZGk) -### NDA +#### NDA - [ ] **HR**: send the team member an NDA to sign [NDAs Development Team](https://drive.google.com/drive/u/0/folders/1lcHmu14jz_bXscZPIatbbMTwfwA6e3eV) @@ -128,7 +129,7 @@ - [ ] **HR**: store in [signed directory](https://drive.google.com/drive/u/0/folders/17T2IdKOMAmyfU3hkmMo3Eo6qiejBFy3r) -### Hubstaff +#### Hubstaff - [ ] **HR**: Update Hubstaff - Add user [here](https://app.hubstaff.com/organizations/398809/invites) @@ -141,7 +142,7 @@ - Read [Tools - Hubstaff](/docs/onboarding/all.track_time_with_hubstaff.how_to_guide.md) -### IT setup +#### IT setup - [ ] **Team leader**: File an issue with this checklist - The title is "Onboarding {{Name}}" @@ -198,10 +199,10 @@ - [ ] [Kaizen-ai](https://github.com/kaizen-ai/kaizenflow) - [ ] [dev_tools](https://github.com/kaizen-ai/dev_tools) - [ ] [cmamp](https://github.com/cryptokaizen/cmamp/settings/access) - - (On per-need basis) - [ ] [orange](https://github.com/cryptokaizen/orange/settings/access) - - (On per-need basis) - [ ] [UMD_data605](https://github.com/gpsaggese/umd_data605/settings/access) + - (On per-need basis) [ ] + [orange](https://github.com/cryptokaizen/orange/settings/access) + - (On per-need basis) [ ] + [UMD_data605](https://github.com/gpsaggese/umd_data605/settings/access) - [ ] **Team member**: Confirm access to GitHub repos @@ -210,7 +211,7 @@ - 
[Invite](https://app.zenhub.com/workspaces/cm-615371012ed326001e044788/board?invite=true) - [ ] **Team member**: Confirm access to ZenHub - [here](https://app.zenhub.com/workspaces/cm-615371012ed326001e044788/board?invite=true) + [here](https://app.zenhub.com/workspaces/cm-615371012ed326001e044788/board?invite=true) - [ ] **IT**: Server set-up - We use the personal laptop as a thin client only to connect to the servers, @@ -239,48 +240,41 @@ one of your face, so we can virtually get to know each other - [ ] **Team member**: Make sure you have access to the - [vacation/OOTO calendar](https://calendar.google.com/calendar/u/0?cid=Y19kYWRlOGU0NTUwMzhiMDllMmUzNDk1OWM2YzFkYWNhYTVmMTAzYjdjZmNiODQ1MDkzOWZhMTBkZDY2NWI3ZjJhQGdyb3VwLmNhbGVuZGFyLmdvb2dsZS5jb20). + [vacation/OOTO calendar](https://calendar.google.com/calendar/u/0?cid=Y19kYWRlOGU0NTUwMzhiMDllMmUzNDk1OWM2YzFkYWNhYTVmMTAzYjdjZmNiODQ1MDkzOWZhMTBkZDY2NWI3ZjJhQGdyb3VwLmNhbGVuZGFyLmdvb2dsZS5jb20). - The link should be accessible and you should see the calendar in the list of calendars at calendar.google.com (when accessing via your corporate email) -- [ ] **Team member**: Add your usual working hours by going to calendar.google.com - (using your corporate email), heading to the settings section by clicking the - gear icon on top right +- [ ] **Team member**: Add your usual working hours by going to + calendar.google.com (using your corporate email), heading to the settings + section by clicking the gear icon on top right - - [ ] **Team member**: Confirm you can access the anonymous form to ask anything - [https://forms.gle/KMQgobqbyxhoTR9n6](https://forms.gle/KMQgobqbyxhoTR9n6) + [https://forms.gle/KMQgobqbyxhoTR9n6](https://forms.gle/KMQgobqbyxhoTR9n6) - [ ] **Team member**: File first Issue on GitHub - It should be called "Document review while onboarding $TEAM_MEMBER" - Track what is not clear in the onboarding process / documentation and what should / could be improved -# Quick checklists to verify that everything works +## Quick checklists to verify that everything works - **Team member** - [ ] VPN to dev server - - [ ] ssh into the dev server - - [ ] Check access to AWS on the server (refer to instructions above) - - [ ] Clone the code from Git - - [ ] Connect to server with VisualStudio Code or PyCharm - There is an extension for VSCode, which allows to develop remotely [Remote - SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) - Follow this instruction on how to set it up in your [Visual Studio Code](/docs/work_tools/all.visual_studio_code.how_to_guide.md) - - [ ] Run the unit tests and make sure they all pass - - [ ] Run a docker container ``` > i docker_bash ``` - - [ ] Run a jupyter notebook - Follow this [instruction](/docs/work_tools/all.visual_studio_code.how_to_guide.md#how-to-access-the-jupyter-server-running-on-the-remote-server-through-your-local-machine) @@ -290,10 +284,10 @@ > i docker_jupyter ``` -## The second day +### The second day - [ ] **Team member**: carefully study all the documents in: - [docs/onboarding](https://github.com/cryptokaizen/cmamp/tree/master/docs/onboarding) + [docs/onboarding](https://github.com/cryptokaizen/cmamp/tree/master/docs/onboarding) - Read it carefully one by one - Ask questions - Memorize / internalize all the information @@ -307,8 +301,8 @@ - [ ] **Team member**: exercise all the important parts of the systems - [ ] Create a GitHub issue - [ ] Get familiar with - [ZenHub](https://app.zenhub.com/workspaces/cm-615371012ed326001e044788/board) - 
[doc](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) + [ZenHub](https://app.zenhub.com/workspaces/cm-615371012ed326001e044788/board) + [doc](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) - Check out the code on server - Run all regressions on server - Create a branch diff --git a/docs/onboarding/all.organize_email.how_to_guide.md b/docs/onboarding/all.organize_email.how_to_guide.md index 3b3c22ceb0..667f2b2892 100755 --- a/docs/onboarding/all.organize_email.how_to_guide.md +++ b/docs/onboarding/all.organize_email.how_to_guide.md @@ -1,4 +1,6 @@ -# Email +# Organize Email + +## Email @@ -13,14 +15,15 @@ * [Gdocs](#gdocs) * [TODO emails](#todo-emails) * [Asana](#asana) + -# Mailing lists +## Mailing lists - `@all` is the mailing list with everybody at the company - `@contributors` is the mailing list with every open-source contributor -# Organizing email flow +## Organizing email flow - We receive tons of emails, and the inflow is going to keep increasing - At a large company you can get 10k emails per day (no kidding) @@ -28,12 +31,14 @@ - How can one do that? - As usual the answer is get organized - Filter emails in folders - - Separate emails in folders based on the action that they require (e.g., ignore, just read and be aware of it, read and respond) + - Separate emails in folders based on the action that they require (e.g., + ignore, just read and be aware of it, read and respond) - Read email and decide what to do about each of it: - No reply needed - Reply right away - Follow up later (e.g., to read, reply, think about it) - - Use flags to distinguish what needs to be followed up later or if you are waiting for a response + - Use flags to distinguish what needs to be followed up later or if you are + waiting for a response - A possible organization in folders is: - GitHub - Commits in all the repos (be aware of it) @@ -47,13 +52,14 @@ - New comment/activity - New task assignment -# Anatomy of email messages from infra +## Anatomy of email messages from infra - The goal is to classify emails so that we can filter email effectively -## Filtering emails with Gmail +### Filtering emails with Gmail -- Personally (GP) I prefer an email client (Mozilla Thunderbird and more recently Apple Mail) rather than using Gmail web interface +- Personally (GP) I prefer an email client (Mozilla Thunderbird and more + recently Apple Mail) rather than using Gmail web interface - People are able to use it - Personally I prefer to use filters on the Gmail (server) side - Pros @@ -62,103 +68,110 @@ - Folders are on the server side, so my client can simply sync - Cons - The Gmail interface for filtering emails is horrible -- The web interface is [https://mail.google.com/mail/u/0/#settings/filters](https://mail.google.com/mail/u/0/#settings/filters) -- Note that Gmail distinguish different email accounts using different indices, e.g., [https://mail.google.com/mail/u//#inbox](https://mail.google.com/mail/u//#inbox) +- The web interface is + [https://mail.google.com/mail/u/0/#settings/filters](https://mail.google.com/mail/u/0/#settings/filters) +- Note that Gmail distinguish different email accounts using different indices, + e.g., + [https://mail.google.com/mail/u//#inbox](https://mail.google.com/mail/u//#inbox) -## Notifications from GitHub +### Notifications from GitHub - 
[https://help.github.com/en/categories/receiving-notifications-about-activity-on-github](https://help.github.com/en/categories/receiving-notifications-about-activity-on-github) -## GitHub pull requests +### GitHub pull requests -- These emails look like: - - Samarth KaPatel - to cryptokaizen/orange, Subscribed +- These emails look like: - cryptokaizen/cmamp#4765 - - Refactoring amp_path on orange - - PR in cmamp - cryptokaizen/cmamp#4788 - ________________________________________________________________ + Samarth KaPatel + to cryptokaizen/orange, Subscribed - You can view, comment on, or merge this pull request online at: - https://github.com/cryptokaizen/orange/pull/411 + cryptokaizen/cmamp#4765 + - Refactoring amp_path on orange + - PR in cmamp - cryptokaizen/cmamp#4788 + ________________________________________________________________ - Commit Summary - - b2b4940 orange fix - - 850b2b2 amp + You can view, comment on, or merge this pull request online at: + https://github.com/cryptokaizen/orange/pull/411 - File Changes (2 files) - - M amp (2) - - M dataflow_orange/system/Cx/test/test_master_pnl_real_time_observer_notebook.py (5) + Commit Summary + - B2b4940 orange fix + - 850b2b2 amp + + File Changes (2 files) + - M amp (2) + - M dataflow_orange/system/Cx/test/test_master_pnl_real_time_observer_notebook.py (5) Patch Links: - - https://github.com/cryptokaizen/orange/pull/411.patch - - https://github.com/cryptokaizen/orange/pull/411.diff - -- These emails have the words: "You can view, comment on, or merge this pull request online at:" in the body of the email + - Https://github.com/cryptokaizen/orange/pull/411.patch + - Https://github.com/cryptokaizen/orange/pull/411.diff + +- These emails have the words: "You can view, comment on, or merge this pull + request online at:" in the body of the email -## GitHub issue activity +### GitHub issue activity -- These emails look like: - - Samarth KaPatel - to Review, kaizen-ai/kaizenflow +- These emails look like: + + Samarth KaPatel + to Review, kaizen-ai/kaizenflow + + @samarth9008 requested your review on: #436 Update Signing_up_for_Kaizenflow.md. + __ + Reply to this email directly, view it on GitHub, or unsubscribe. + You are receiving this because your review was requested. - @samarth9008 requested your review on: #436 Update Signing_up_for_Kaizenflow.md. - __ - Reply to this email directly, view it on GitHub, or unsubscribe. - You are receiving this because your review was requested. + or: - or: - - Grigorii Pomazkin - to cryptokaizen/cmamp, Mention + Grigorii Pomazkin + to cryptokaizen/cmamp, Mention - @PomazkinG commented on this pull request. - ________________________________________________________________________ + @PomazkinG commented on this pull request. + ________________________________________________________________________ - In helpers/lib_tasks_pytest.py: + In helpers/lib_tasks_pytest.py: - > @@ -671,17 +671,25 @@ def run_coverage_report( # type: ignore - :param aws_profile: the AWS profile to use for publishing HTML report - """ - # TODO(Grisha): allow user to specify which tests to run. - ... - obsolete, resolving - __ - Reply to this email directly, view it on GitHub, or unsubscribe. - You are receiving this because you were mentioned. + > @@ -671,17 +671,25 @@ def run_coverage_report( # type: ignore + :param aws_profile: the AWS profile to use for publishing HTML report + """ + # TODO(Grisha): allow user to specify which tests to run. + ... 
+ obsolete, resolving + __ + Reply to this email directly, view it on GitHub, or unsubscribe. + You are receiving this because you were mentioned. -- These emails can be recognized by the fact that have the words "You are receiving this because" in the email body +- These emails can be recognized by the fact that have the words "You are + receiving this because" in the email body -## Commits +### Commits - These emails look like: - - Grigorii Pomazkin - to cryptokaizen/cmamp, Push - @PomazkinG pushed 1 commit. - - 65496bb Merge branch 'master' into CmTask4707_fix_coverage_test - __ - View it on GitHub or unsubscribe. - You are receiving this because you are subscribed to this thread. + Grigorii Pomazkin + to cryptokaizen/cmamp, Push -- These emails can be recognized by the fact that have the words "pushed commit" in the email body + @PomazkinG pushed 1 commit. + - 65496bb Merge branch 'master' into CmTask4707_fix_coverage_test + __ + View it on GitHub or unsubscribe. + You are receiving this because you are subscribed to this thread. -## Gdocs +- These emails can be recognized by the fact that have the words "pushed commit" + in the email body -- These emails have `comments-noreply@docs.google.com` or (Google Docs) in the "subject" field +### Gdocs -## TODO emails +- These emails have `comments-noreply@docs.google.com` or (Google Docs) in the + "subject" field + +### TODO emails - These emails have TODO in the subject -## Asana +### Asana - These emails are received from `no-reply@asana.com` address - These emails may contain: - - List of tasks assigned to you for today - - New activities in the tasks assigned to you - - New task assignment + - List of tasks assigned to you for today + - New activities in the tasks assigned to you + - New task assignment diff --git a/docs/onboarding/all.receive_crypto_payment.how_to_guide.md b/docs/onboarding/all.receive_crypto_payment.how_to_guide.md index aaf7144aab..58f402f6b4 100644 --- a/docs/onboarding/all.receive_crypto_payment.how_to_guide.md +++ b/docs/onboarding/all.receive_crypto_payment.how_to_guide.md @@ -1,4 +1,4 @@ - +# Receive Crypto Payment @@ -16,12 +16,12 @@ -# Choose a crypto wallet type +## Choose a crypto wallet type - Https://academy.binance.com/en/articles/crypto-wallet-types-explained - Https://www.blockchain-council.org/blockchain/types-of-crypto-wallets-explained/ -## Public / private keys +### Public / private keys See the explanation [here](https://www.gemini.com/cryptopedia/public-private-keys-cryptography) @@ -32,9 +32,9 @@ TLDR: - Private key is needed to prove ownership or spend the funds associated with your public address, see it as a password -## What do people in the team use +### What do people in the team use -### Beginner +#### Beginner Mobile / desktop hot wallet [Exodus](https://www.exodus.com/) @@ -55,7 +55,7 @@ Cons: - As any hot wallet is vulnerable to hackers attacks - It is not recommended to store significant amount of money -### Advanced +#### Advanced If you are experienced and / or own a lot of crypto you might want to have a cold wallet on your laptop, holding the bulk of your bitcoin and transfer among @@ -63,14 +63,14 @@ different website on-line wallets. 
https://bitcoin.org/en/choose-your-wallet -# Choose a crypto currency to receive money in +## Choose a crypto currency to receive money in - One can either receive a payment in BTC or in one of the Stablecoins (USDT is a preference) - People in the team prefer to use Stablecoins since Stablecoins pursue price stability -# Send a crypto wallet address +## Send a crypto wallet address Send your crypto wallet address to GP together with an invoice via email. E.g., ``` @@ -81,7 +81,7 @@ Please find my invoice for the period [a, b] attached. I would like to receive t My crypto wallet address is: 0x5ce3d650703f745B9C0cf20E322204b00bF59205 ``` -# Do a test transfer +## Do a test transfer For the first time GP sends a test transfer (e.g., 100$) just to confirm that a wallet address provided works. @@ -89,9 +89,9 @@ wallet address provided works. After a wallet address is "verified" send the full amount (exluding the test transfer amount). -# Cash out crypto +## Cash out crypto -## Bank account +### Bank account There are some 3rd party services that buy crypto money from you and send fiat money to your bank account @@ -102,7 +102,7 @@ E.g., - [Coinbase cash out serivce](https://help.coinbase.com/en/coinbase/trading-and-funding/buying-selling-or-converting-crypto/how-do-i-sell-or-cash-out-my-digital-currency) - [Binance P2P](https://p2p.binance.com/en/trade/all-payments/USDT?fiat=USD) -## Cash +### Cash There are some crypto ATMs in the USA, see [here](https://coinatmradar.com/country/226/bitcoin-atm-united-states/) diff --git a/docs/onboarding/all.track_time_with_hubstaff.how_to_guide.md b/docs/onboarding/all.track_time_with_hubstaff.how_to_guide.md index ac7f52542a..0c954a910e 100644 --- a/docs/onboarding/all.track_time_with_hubstaff.how_to_guide.md +++ b/docs/onboarding/all.track_time_with_hubstaff.how_to_guide.md @@ -1,29 +1,31 @@ -# Hubstaff +# Track Time With Hubstaff + +## Hubstaff - [General](#general) - [Time Tracking](#time-tracking) - - [Privacy](#privacy) - - [Tracking time automatically](#tracking-time-automatically) - - [Tracking time manually](#tracking-time-manually) - - [Overriding time tracking](#overriding-time-tracking) + * [Privacy](#privacy) + * [Tracking time automatically](#tracking-time-automatically) + * [Tracking time manually](#tracking-time-manually) + * [Overriding time tracking](#overriding-time-tracking) -# General +## General - [Hubstaff](https://hubstaff.com/) is a tool for remote working that automates: - - time tracking - - invoice creation - - payment + - Time tracking + - Invoice creation + - Payment - The goal is to replace our [hour log spreadsheet](https://docs.google.com/spreadsheets/d/1oNd6ORhc94oUzg5nhNC7fQelN_PmfAv110F7lUiZsxo/edit#gid=0) with Hubstaff and get paid automatically for the worked hours. -# Time Tracking +## Time Tracking -## Privacy +### Privacy - We have decided to turn off the feature of taking screenshots of the laptop as proof-as-work, URL, and app tracking. @@ -43,7 +45,7 @@ - - You can change your mind over time. 
-## Tracking time automatically +### Tracking time automatically - You can use the desktop [App](https://app.hubstaff.com/download) - I prefer this since it's completely automated @@ -55,7 +57,7 @@ - See [https://hubstaff.com/how-tracking-works](https://hubstaff.com/how-tracking-works) -## Tracking time manually +### Tracking time manually - You can track time using the [Google Chrome extension](https://chrome.google.com/webstore/detail/hubstaff-time-tracker/mipeohjjimeknlkekbemdjbjniogbgel): @@ -63,7 +65,7 @@ - Start the timer when you start the work - Stop the timer when the work is paused -## Overriding time tracking +### Overriding time tracking - In case you forgot to turn on the timer or you are tracking time manually, go to diff --git a/docs/trading_ops/all.how_to_name_objects.explanation.md b/docs/trading_ops/all.how_to_name_objects.explanation.md index ec0889a750..4abcd00d01 100644 --- a/docs/trading_ops/all.how_to_name_objects.explanation.md +++ b/docs/trading_ops/all.how_to_name_objects.explanation.md @@ -2,16 +2,19 @@ -- [Airflow DAGs](#airflow-dags) -- [Git branch names](#git-branch-names) -- [Docker image names](#docker-image-names) -- [Directory names](#directory-names) -- [Notebook names](#notebook-names) -- [Telegram channels](#telegram-channels) -- [Invoke task definition](#invoke-task-definition) +- [How to name objects](#how-to-name-objects) + * [Airflow DAGs](#airflow-dags) + * [Git branch names](#git-branch-names) + * [Docker image names](#docker-image-names) + * [Directory names](#directory-names) + * [Notebook names](#notebook-names) + * [Telegram channels](#telegram-channels) + * [Invoke task definition](#invoke-task-definition) +# How to name objects + When we have lots of accounts, experiments, production systems running at the same time, we need to standardize naming conventions. diff --git a/docs/trading_ops/all.scheduled_trading.explanation.md b/docs/trading_ops/all.scheduled_trading.explanation.md index 5512590372..6cf6c31f4b 100644 --- a/docs/trading_ops/all.scheduled_trading.explanation.md +++ b/docs/trading_ops/all.scheduled_trading.explanation.md @@ -48,10 +48,10 @@ are published: - Latest version - Stable link which contains notebooks links of the latest run. E.g., - http://172.30.2.44/notebooks/Master_Analysis/Master_trading_system_report.latest.html + http://172.30.2.44/v2/trading_ops/trading_reports/prod/C11a.config1/2024/08/Master_trading_system_report.latest.html - Timestamp version - It contains notebook links of that particular timestamp run. 
E.g., - http://172.30.2.44/notebooks/Master_Analysis/Master_trading_system_report.0.20240418-102140.html + http://172.30.2.44/v2/trading_ops/trading_reports/prod/C11a.config1/2024/08/Master_trading_system_report.prod.C11a.config1.20240821_210500.20240821_220000.html - Here is an example of the directory structure for the trade execution experiment: diff --git a/docs/trading_ops/all.shadow_monitoring.explanation.md b/docs/trading_ops/all.shadow_monitoring.explanation.md index c29618c603..cf287bdfea 100644 --- a/docs/trading_ops/all.shadow_monitoring.explanation.md +++ b/docs/trading_ops/all.shadow_monitoring.explanation.md @@ -10,19 +10,19 @@ - `C11a.config1`: - DAG: - http://172.30.2.114:8090/dags/preprod.tokyo.shadow_trading_system_observer.C11a.config1/grid + http://internal-a97b7f81b909649218c285140e74f68a-1285736094.eu-north-1.elb.amazonaws.com:8080/dags/preprod.tokyo.shadow_trading_system_observer.C11a.config1/grid - Stable link: http://172.30.2.44/system_reconciliation/C11a.config1.shadow_trading.last_5minutes.html - `C11a.config3`: - DAG: - http://172.30.2.114:8090/dags/preprod.tokyo.shadow_trading_system_observer.C11a.config3/grid + http://internal-a97b7f81b909649218c285140e74f68a-1285736094.eu-north-1.elb.amazonaws.com:8080/dags/preprod.tokyo.shadow_trading_system_observer.C11a.config3/grid - Stable link: http://172.30.2.44/system_reconciliation/C11a.config3.shadow_trading.last_5minutes.html - `C14a.config1`: - DAG: - http://172.30.2.114:8090/dags/preprod.tokyo.shadow_trading_system_observer.C14a.config1/grid + http://internal-a97b7f81b909649218c285140e74f68a-1285736094.eu-north-1.elb.amazonaws.com:8080/dags/preprod.tokyo.shadow_trading_system_observer.C14a.config1/grid - Stable link: http://172.30.2.44/system_reconciliation/C14a.config1.paper_trading.last_5minutes.html diff --git a/docs/trading_ops/all.trading_run_summary_sheet.explanation.md b/docs/trading_ops/all.trading_run_summary_sheet.explanation.md index b0819ee884..f047487786 100644 --- a/docs/trading_ops/all.trading_run_summary_sheet.explanation.md +++ b/docs/trading_ops/all.trading_run_summary_sheet.explanation.md @@ -1,4 +1,4 @@ - +# Trading Run Summary Sheet @@ -7,7 +7,7 @@ -# Document description +## Document description Gsheet: [2024Q2/Q3 - Scheduled trading](https://docs.google.com/spreadsheets/d/1abvqin3UXJUYmaOD999IBZDoOFGYjUk-4sP7Bd37Ukw/) @@ -18,7 +18,7 @@ available. The data is entered into the Gsheet every day before 8:30 AM ET. 
-# Field descriptions +## Field descriptions - **Date:** the date of the DAG run in YYYY-MM-DD format diff --git a/docs/work_organization/all.buildmeister.how_to_guide.md b/docs/work_organization/all.buildmeister.how_to_guide.md index b3937f49e9..1a5582f4b7 100644 --- a/docs/work_organization/all.buildmeister.how_to_guide.md +++ b/docs/work_organization/all.buildmeister.how_to_guide.md @@ -1,4 +1,6 @@ -# Buildmeister process +# Buildmeister + +## Buildmeister process @@ -12,7 +14,7 @@ -# General +## General - The Buildmeister rotates every 2 weeks - To see who is the Buildmeister now refer to @@ -30,7 +32,7 @@ - Additional information about the [tests](/docs/coding/all.unit_tests.how_to_guide.md) -# Notification system +## Notification system - `@CK_cmamp_buildbot` notifies the team about breaks via Telegram channel `CK build notifications` @@ -45,7 +47,7 @@ Example: - -# Buildmeister instructions +## Buildmeister instructions - You receive a break notification from `@CK_cmamp_buildbot` - Have a look at the message @@ -127,7 +129,7 @@ Example: - When your time of the Buildmeister duties is over, confirm the rotation with the next responsible person in the related Telegram chat. -### `update_amp_submodule` fails +#### `update_amp_submodule` fails - When this happens, the first thing to do is attempt to update the `amp` pointer manually @@ -148,14 +150,14 @@ Example: - There is also an invoke target `git_roll_amp_forward` that does an equivalent operation -# Buildmeister dashboard +## Buildmeister dashboard The Buildmeister dashboard is a tool that provides a quick overview of the current state of the results of all GitHub Actions workflows. See [run and publish the buildmeister dashboard](/docs/infra/ck.gh_workflows.explanation.md#run-and-publish-the-buildmeister-dashboard) for detailed information. -# Allure Reports Analysis +## Allure Reports Analysis - For a background on Allure, refer to these docs - Detailed info can be found in the official @@ -213,7 +215,7 @@ for detailed information. - The goal here is to provide more context when filing an issue so that we can make better decisions -# Post-mortem analysis (TBD) +## Post-mortem analysis (TBD) - We want to understand on why builds are broken so that we can improve the system to make it more robust diff --git a/docs/work_organization/all.contributor_scoring.how_to_guide.md b/docs/work_organization/all.contributor_scoring.how_to_guide.md index 4bb9634c7b..7c5c008b3e 100644 --- a/docs/work_organization/all.contributor_scoring.how_to_guide.md +++ b/docs/work_organization/all.contributor_scoring.how_to_guide.md @@ -19,44 +19,53 @@ ### General -- We want to evaluate and provide feedback to our team members on different aspects of their work. +- We want to evaluate and provide feedback to our team members on different + aspects of their work. -- We don't take non-perfect scores personally but just as a way to understand what to improve. +- We don't take non-perfect scores personally but just as a way to understand + what to improve. - The scoring template is here -[Scoring template](https://docs.google.com/spreadsheets/u/2/d/1WsWT8By2hr1VqB6ulIXf3_Rfa0zE2KHI/edit?usp=drive_web&ouid=106425005676808098789&rtpof=true) (this is an Excel spreadsheet since you need to upload it and it needs to be a file and not a Google Sheet). 
+ [Scoring template](https://docs.google.com/spreadsheets/u/2/d/1WsWT8By2hr1VqB6ulIXf3_Rfa0zE2KHI/edit?usp=drive_web&ouid=106425005676808098789&rtpof=true) + (this is an Excel spreadsheet since you need to upload it and it needs to be a + file and not a Google Sheet). - Each metric is scored between 1 (poor), 3 (average) and 5 (excellent) - - - We consider 4 as acceptable, anything less than 4 as problematic and needs improve + - We consider 4 as acceptable, anything less than 4 as problematic and needs + improve - We want to score everyone we work with: - - Initially only people that we supervise, later on anyone - Feedback is anonymous - At least 2 persons should score everyone - Scoring frequency - - Every 2 weeks for full-time candidates, part-time collaborators - Every month for full-time team ### Current process - Every scoring needs to happen (e.g., every two weeks): - - - Mentor make a copy of the Excel spreadsheet [Scoring template](https://docs.google.com/spreadsheets/u/2/d/1WsWT8By2hr1VqB6ulIXf3_Rfa0zE2KHI/edit?usp=drive_web&ouid=106425005676808098789&rtpof=true) - - Rename the template "Scoring - {Scorer} - {ScoringDate}" (e.g., "Scoring - GP - 2023-09-01") - - Fill out the rows for the people that they need to score by looking at the Mentor column - - Upload your Scoring Excel file [here](https://docs.google.com/forms/d/e/1FAIpQLSdXhjHo52Roz_ROY-zlkg0YPMHCzoDXmPpCd1x-KmeCtQVd5g/viewform) + - Mentor make a copy of the Excel spreadsheet + [Scoring template](https://docs.google.com/spreadsheets/u/2/d/1WsWT8By2hr1VqB6ulIXf3_Rfa0zE2KHI/edit?usp=drive_web&ouid=106425005676808098789&rtpof=true) + - Rename the template "Scoring - {Scorer} - {ScoringDate}" (e.g., "Scoring - + GP - 2023-09-01") + - Fill out the rows for the people that they need to score by looking at the + Mentor column + - Upload your Scoring Excel file + [here](https://docs.google.com/forms/d/e/1FAIpQLSdXhjHo52Roz_ROY-zlkg0YPMHCzoDXmPpCd1x-KmeCtQVd5g/viewform) - You should see - - - (For admin use, the source is [here](https://docs.google.com/forms/d/1IXpcMSrtVI0xO3eNMrzGNJ0zpv-KySQVXjujlSuZlpo/edit) - and [here](https://drive.google.com/drive/u/1/folders/1r-npms62yEvO90bXq8yZ99MkQk21c6SrQ5PpT1KqLpF1cnUqwJgO8E7cuD2t6zZe2P3hwjbe)) - - - One of the integrators (GP, Paul, or somebody else) merges all the scoring template in a single one, and then creates the averaged score for each person + - (For admin use, the source is + [here](https://docs.google.com/forms/d/1IXpcMSrtVI0xO3eNMrzGNJ0zpv-KySQVXjujlSuZlpo/edit) + and + [here](https://drive.google.com/drive/u/1/folders/1r-npms62yEvO90bXq8yZ99MkQk21c6SrQ5PpT1KqLpF1cnUqwJgO8E7cuD2t6zZe2P3hwjbe)) + - One of the integrators (GP, Paul, or somebody else) merges all the scoring + template in a single one, and then creates the averaged score for each + person - The scores are then distributed anonymously - - Scored team members don't know who / how many mentors scored them (although they have a clue about at least one mentor) + - Scored team members don't know who / how many mentors scored them + (although they have a clue about at least one mentor) ## Scoring topics @@ -65,14 +74,13 @@ - Topics should be independent - We should provide - - Concrete questions to assess how people do on each topic - - Ways to improve the score (e.g., "read this book!", "do more of this and less of this") + - Ways to improve the score (e.g., "read this book!", "do more of this and + less of this") ### Current topics - Scoring table contains the following fields: - - Quality of code - Writes 
elegant code?
  - Follows our standards and conventions?
@@ -123,19 +131,20 @@
 - Positive energy
   - Has an upbeat approach to working even if sh\*t doesn't work (since things never work)?
-  - Is a [Negative Nelly](https://www.urbandictionary.com/define.php?term=negative%20nelly)?
+  - Is a
+    [Negative Nelly](https://www.urbandictionary.com/define.php?term=negative%20nelly)?
 - Dev %, Data scientist %, Devops %
   - This just measures how much of a role one team member can cover
   - See below

 ### Roles

-- We want to define how each team-member is comfortable covering several high level activities.
+- We want to define how each team-member is comfortable covering several high
+  level activities.
 - The idea is to understand what roles a new hire can play.
 - Roles are not mutually exclusive
-
   - E.g., a jack-of-all-trades can be 4 on all topics
   - E.g., XYZ is a data scientist and has data science=5, dev=3, devops=1
 - Data science
@@ -187,4 +196,4 @@
   - This is a minor metric: the number of hours doesn't really matter as long as stuff is done
   - On the other hand, if somebody consistently doesn't put enough time to get
-    the needed stuff done, it can become a problem
\ No newline at end of file
+    the needed stuff done, it can become a problem
diff --git a/docs/work_organization/all.datapull_DAGmeister.how_to_guide.md b/docs/work_organization/all.datapull_DAGmeister.how_to_guide.md
new file mode 100644
index 0000000000..9b88d87b8c
--- /dev/null
+++ b/docs/work_organization/all.datapull_DAGmeister.how_to_guide.md
@@ -0,0 +1,76 @@
+# Datapull Dagmeister
+
+## DataPull DagMeister process
+
+
+
+- [General](#general)
+- [Notification system](#notification-system)
+- [DagMeister instructions](#dagmeister-instructions)
+
+
+
+## General
+
+- The DagMeister rotates every 2 weeks
+  - To see who is the DagMeister now refer to
+    [DataPull_DagMeister gsheet](https://docs.google.com/spreadsheets/d/1Ab6a3BVeLX1l1B3_A6rNY9pHRsofeoCw2ip2dkQ6SdA/edit#gid=0)
+  - Each rotation should be confirmed by a 'handshake' between the outgoing
+    DagMeister and the new one in the related Telegram chat
+    `Kaizen Preprod Datapull Notifications`
+  - Transfer the assignee of
+    [#8785](https://github.com/cryptokaizen/cmamp/issues/8785) to the new DagMeister
+- The DagMeister is responsible for:
+  - Checking the Telegram channel for any failures from preprod DAGs.
+  - Raising an issue on GitHub for the failure after debugging its root cause.
+  - If the issue is already raised, commenting with the link to the failure in
+    the existing issue, citing the same reason.
+  - All issues should come under the single Epic
+    [#8785](https://github.com/cryptokaizen/cmamp/issues/8785)
+  - Tagging the team leader in the issue to confirm whether it needs to be
+    fixed with the highest priority.
+  - All the failures from the `tokyo` region are of the highest priority and
+    need to be resolved ASAP.
+
+## Notification system
+
+- `@CK_Airflow_bot` notifies the team about breaks via Telegram channel
+  `Kaizen Preprod Datapull Notifications`
+- A notification contains:
+  - DAG start timestamp
+  - Link to the broken DAG
+
+## DagMeister instructions
+
+- You receive a break notification from `@CK_Airflow_bot`
+- Have a look at the message
+  - Do it right away, this is always your highest priority task
+- Notify the team
+  - If the break happened in the `tokyo` region for `bid_ask` or `OHLCV` DAGs,
+    ping the channel by tagging the team leader.
+  - Reply to the failure message to signal that you are already looking into it.
+  - After the issue is filed, reply with the issue number.
+ - There could be multiple failure due to the same reason so just reply with + same issue number. + +- File an Issue in GH / ZH to report the failing tests and the errors + - Paste the URL of the failing run + - Example: [#9110](https://github.com/cryptokaizen/cmamp/issues/9110) + - Provide as much information as possible to give an understanding of the + problem + - Stack trace or part of it (if it's too large) + - Paste the link of QA notebook if QA failed. + - Add the issue to the + [DATAPULL- Fix failing DAGs](https://github.com/cryptokaizen/cmamp/issues/8785) + Epic so that we can track it + +- Fixing the issue + - If the bug is obvious and can be fixed easily. Fix it with highest priority. + - If fixing will require debugging time, tag the team leader to ask for + priority. + - IMPORTANT: Disabling a DAG is not the first choice, it's a measure of last + resort! and should oly be done after the approval from the team leader. + +- When your time of the DAGMeister duties is over, confirm the rotation with the + next responsible person in the related Telegram chat. diff --git a/docs/work_organization/all.epicmeister.how_to_guide.md b/docs/work_organization/all.epicmeister.how_to_guide.md index 5613717a4d..373620f387 100644 --- a/docs/work_organization/all.epicmeister.how_to_guide.md +++ b/docs/work_organization/all.epicmeister.how_to_guide.md @@ -1,4 +1,6 @@ -# EpicMeister Process +# Epicmeister + +## EpicMeister Process @@ -9,34 +11,37 @@ -# General +## General - EpicMeister ensures that Epics and issues are well-organized and updated, providing the team with a clear overview of project progress and priorities - By assigning Epics to an issue, the EpicMeister establishes a clear relationship between the larger project goals and the specific tasks, facilitating a holistic view of project progress and alignment -- Refer to this [doc](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) to have a clear - understanding of the workflow using GitHub and ZenHub +- Refer to this + [doc](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) + to have a clear understanding of the workflow using GitHub and ZenHub -# Responsibilities +## Responsibilities -## Epic Management +### Epic Management -- Keep this [document](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md#list-of-epics) that lists - all existing Epics updated +- Keep this + [document](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md#list-of-epics) + that lists all existing Epics updated - When a new Epic is required, after discussion - Create it within ZenHub - - Make sure the Epics are alphabetically organized on the ZenHub by right-click - and select to sort them - - Update the [document](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) + - Make sure the Epics are alphabetically organized on the ZenHub by + right-click and select to sort them + - Update the + [document](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md) - Provide a concise title that reflects the nature of the Epic - Craft a description that outlines the goals and scope of the Epic - Every two weeks create a checklist of the team members in - 
[this](https://github.com/cryptokaizen/cmamp/issues/5668) issues to make sure team - members are cleaning up the board. + [this](https://github.com/cryptokaizen/cmamp/issues/5668) issues to make sure + team members are cleaning up the board. -## Issue Organization +### Issue Organization - Ensure that all Issues in GitHub/ZenHub are well-organized - Each Issue should: diff --git a/docs/work_organization/all.rollout.how_to_guide.md b/docs/work_organization/all.rollout.how_to_guide.md index cea5b5c47a..f0e3afe879 100644 --- a/docs/work_organization/all.rollout.how_to_guide.md +++ b/docs/work_organization/all.rollout.how_to_guide.md @@ -1,4 +1,4 @@ - +# Rollout @@ -8,7 +8,7 @@ -# Roll-out process +## Roll-out process - Implement - Prepare documentation @@ -23,7 +23,7 @@ - The assignee is the person in charge of making sure the rollout is done - Send an ORG email with the same content of the Issue -# Roll-out documentation +## Roll-out documentation - A roll-out should address the following points: - Short summary @@ -34,15 +34,15 @@ - Why is it important - Whom to ask for help -# An example of roll-out email +## An example of roll-out email ``` Hello team, -## Intended audience +### Intended audience Anybody using Jupyter notebooks -## What it is about +### What it is about - `publish_notebook.py` is a little tool that allows to: 1. Opening a notebook in your browser (useful for read-only mode) @@ -63,7 +63,7 @@ Anybody using Jupyter notebooks You can get details by running: `dev_scripts/notebooks/publish_notebook.py -h` -## What you need to do +### What you need to do Please update your branches from the `master` for all the submodules. @@ -71,7 +71,7 @@ You can use our shortcut: > make git_pull -## What has changed +### What has changed We've deployed the new service for storing notebooks in HTML format. From now on `publish_notebook.py` will work from the Docker container. The new version of @@ -82,6 +82,6 @@ won't work from now on, we need to replace them with the new ones (http://notebook-keeper.p1/...) If you see any link starts with http://research:8077 replace them with http://notebook-keeper.p1 . 
-## Reference documentation +### Reference documentation //amp/docs/coding/all.publish_notebook.how_to_guide.md ``` diff --git a/docs/work_organization/all.scrum.explanation.md b/docs/work_organization/all.scrum.explanation.md index 23324c34bc..8ff85880e5 100644 --- a/docs/work_organization/all.scrum.explanation.md +++ b/docs/work_organization/all.scrum.explanation.md @@ -1,5 +1,9 @@ -# Scrum Methodology +# Scrum + +## Scrum Methodology + + - [Roles](#roles) * [Goal of Scrum methodology](#goal-of-scrum-methodology) * [Metaphor for the roles in terms of a race car](#metaphor-for-the-roles-in-terms-of-a-race-car) @@ -23,19 +27,20 @@ * [Sprint review](#sprint-review) * [Sprint retrospective](#sprint-retrospective) * [Sprint retrospective: questions](#sprint-retrospective-questions) + - From "Lacey, The Scrum Field Guide: Practical Advice for Your First Year, 2012" -# Roles +## Roles -## Goal of Scrum methodology +### Goal of Scrum methodology - Work in the interests of customers and stakeholders to turn the vision into a working product -## Metaphor for the roles in terms of a race car +### Metaphor for the roles in terms of a race car - ProductOwner = driver @@ -43,7 +48,7 @@ - ScrumMaster = lubricants and sensors -## ScrumMaster +### ScrumMaster - Identify when the team is not performing to its ability @@ -55,7 +60,7 @@ - Can build trust and earn respect -## ProductOwner +### ProductOwner - Represent the customers @@ -68,71 +73,58 @@ - Ultimately he is responsible for success or failure of the projects - Decide: - - What is developed - - When it is developed - - Whether the product meets expectations -## DevTeam +### DevTeam - Aka Team, Development team, Core team - Developers, testers, architects, designers - - Cross-functionality is a good thing - The ideal team size is 6 plus / minus 2 -# Artifacts +## Artifacts -## Product backlog +### Product backlog - = master list of all features and functionalities needed to implement the vision into the product - The ProductOwner keeps the backlog: - - Prioritized - - Up to date - - Clear - The backlog is never complete: - - Items are added and removed - - Reordered based on priority, value, or risk -## Product backlog items +### Product backlog items - Aka PBI - E.g., bugs, features, enhancements, non-functional requirements -## Complexity of PBI +### Complexity of PBI - ProductOwner and the DevTeam estimate the size of each task - The complexity of each task can be expressed in different ways: - - Points - - T-shirt size (S, M, L, XL) -## High-priority vs lower-priority tasks +### High-priority vs lower-priority tasks - High-priority stories should be small and clear - - So they can be brought into the sprint - Lower-priority items can be large and fuzzy - - Bigger stories are decomposed into smaller chunks -## Sprint backlog +### Sprint backlog - = output of the planning meeting @@ -143,14 +135,11 @@ - The DevTeam keeps the sprint backlog up to date - During a sprint - - New tasks are discovered - - Tasks are adjusted (in terms of description or estimated hours) - - Tasks are marked as done -## The burndown +### The burndown - Communicate how much work is remaining and what is the team velocity @@ -159,20 +148,17 @@ - Plot the number of hours remaining (y-axis) against the number of days remaining (x-axis) -# The meetings +## The meetings -## Planning meeting +### Planning meeting - Each sprint begins with a sprint planning attended by the team, ScrumMaster, ProductOwner - - Typically one needs two hours per number of weeks to plan the sprint - 
- For a 1-month sprint, 8 hours of meeting - - For 2-week sprint, 4 hours of meeting -## Part one of sprint planning meeting +### Part one of sprint planning meeting - Review of potential product backlog items for the sprint @@ -182,7 +168,7 @@ - Outcome is one-sentence description of the desired outcome of the sprint -## Part two of sprint planning meeting +### Part two of sprint planning meeting - Many DevTeams discuss how to implement the tasks @@ -196,71 +182,56 @@ - Estimate tasks in terms of hours -## Daily scrum +### Daily scrum - Aka daily stand-up - Give the DevTeam the opportunity to sync daily, at the same time, and at the same place -## Daily scrum: questions +### Daily scrum: questions - The 3 most frequent questions are: - - What have you accomplished since the last meeting? - - What will you accomplish today? - - What obstacles are in your way? -## What the daily scrum is not +### What the daily scrum is not - The daily scrum is not a deep-dive problem-solving meeting - - Any other issues need to be taken offline - It is not a status report meeting to the ScrumMaster - - The purpose is for the DevTeam members to talk to each other - The ProductOwner is in "listen-only" mode -## Sprint review +### Sprint review - On the last day of the sprint, the DevTeam holds a sprint review - Everybody should join - - ScrumMaster - - ProductOwner - - DevTeam - - Customers, key stakeholders - - Executives - DevTeam - - Recaps the goal of the sprint - - Presents the work done - Customers - - Review the progress made on the project - - Accept changes - - Ask for changes -## Sprint retrospective +### Sprint retrospective - After the sprint review, the retrospective is a way to identify how to improve process and execution -## Sprint retrospective: questions +### Sprint retrospective: questions - What went well during the sprint? diff --git a/docs/work_organization/all.team_collaboration.how_to_guide.md b/docs/work_organization/all.team_collaboration.how_to_guide.md index 7a038e93fb..bb8531f47c 100644 --- a/docs/work_organization/all.team_collaboration.how_to_guide.md +++ b/docs/work_organization/all.team_collaboration.how_to_guide.md @@ -1,4 +1,6 @@ -# General Rules of Collaboration +# Team Collaboration + +## General Rules of Collaboration @@ -28,7 +30,7 @@ -# Ask somebody if you have any doubts +## Ask somebody if you have any doubts - If you have doubts on how to do something you want to do: - Look in the @@ -45,7 +47,7 @@ careful and always think for yourself - Don't hesitate to ask anyone, even GP & Paul -# Ping Team Leaders when you are out of tasks +## Ping Team Leaders when you are out of tasks - When you're close to being out of tasks or all your ongoing PRs are waiting for review and are close to being merged, feel free to ping us in the Telegram @@ -55,18 +57,18 @@ - The goal is for everyone to have 2 issues to work on at the same time to avoid getting blocked on us -# Collaboration +## Collaboration -## Why do we need to follow this handbook? +### Why do we need to follow this handbook? 
-### Learning from each other +#### Learning from each other - Proper research and software engineering practices allow us to: - Learn from each other - Accumulate and distill the wisdom of experts - Share lessons learned from our mistakes along the way -### Consistency and process +#### Consistency and process - Consistency is a crucial enabler to make teams faster - Productivity increases when team members "work in the same way", i.e., there @@ -82,12 +84,12 @@ - We are not going to discuss and debate the rationale, but instead assume the above as self-evident truth -## Sync-ups +### Sync-ups - We meet regularly every week and with different audiences to check on the progress of the many projects we work on -### All-hands meetings +#### All-hands meetings - All-hands meeting on Mondays has the following goals: - Summarize ongoing projects and their status @@ -97,7 +99,7 @@ - E.g., organization, process - Talk about the team, hiring, customers -### Technical sync-ups +#### Technical sync-ups - We meet one or two times per week for each of the projects (e.g., IM, WEB3) - Please check your calendar to make sure the times work and the invited @@ -114,14 +116,14 @@ there is not much to discuss - We don't have to fill one hour every time -### Ad-hoc meetings +#### Ad-hoc meetings - Don't hesitate to ask for a quick meeting if you are unsure about: - What exactly needs to be done in a GitHub Issue - How to set-up something (e.g., environment, docker) - Better safe than sorry -### Org emails +#### Org emails - GP & Paul may send emails with the subject starting with "ORG:" pointing to interesting docs that are of general interest and relevance @@ -129,7 +131,7 @@ do and, especially, the rationale of the proposed solutions - It's ok to acknowledge the email replying to `all@kaizen-tech.io` -### Synchronization point +#### Synchronization point - We understand that most of the time everybody is head-down making progress on their tasks @@ -147,7 +149,7 @@ - Everybody does what's asked - Mark on the GitHub task your name -### Morning TODO email +#### Morning TODO email The idea is to send a morning TODO email to broadcast: @@ -274,9 +276,9 @@ If you are a collaborator or intern, follow the steps to join the mailing group request. It should look like this: ![alt_text](figs/team_collaboration/3_Request.png) -## Communication +### Communication -### Use the right form of communication +#### Use the right form of communication - GitHub - This is a major form of communication about technical details, so if you @@ -322,26 +324,26 @@ If you are a collaborator or intern, follow the steps to join the mailing group https://forms.gle/KMQgobqbyxhoTR9n6 - The question will be discussed at the all hands meeting -### DRY also applies to documentation +#### DRY also applies to documentation - [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself)! 
Do not Repeat Yourself - E.g., it's not a good idea to cut & paste pieces of Gdocs in a GitHub bug, rather just point to the relevant session on Gdocs from the GitHub bug -### Avoid write-once code and research +#### Avoid write-once code and research - Code and research is: - Written once by a few people - Read many times by many people - Therefore it is essential to invest in the process of writing it heavily -### Consistency +#### Consistency - Coding/research across our group is done with consistent procedures, code layout, and naming conventions -### Training period +#### Training period - When you start working with us, you need to go through a period of training in following the procedures and conventions described in this handbook @@ -361,14 +363,14 @@ If you are a collaborator or intern, follow the steps to join the mailing group but months if you resist or treat it as an afterthought - Our suggestion is to accept these rules as the existence of gravity -### Go slowly to go faster +#### Go slowly to go faster - Once you reach proficiency, you will be moving much faster and make up for the invested time - In fact, everyone will be much quicker, because everyone will be able to look at any part of the codebase or any notebook and get oriented quickly -## Vacations/OOTO time +### Vacations/OOTO time - We use [vacation calendar](https://calendar.google.com/calendar/u/0?cid=Y19kYWRlOGU0NTUwMzhiMDllMmUzNDk1OWM2YzFkYWNhYTVmMTAzYjdjZmNiODQ1MDkzOWZhMTBkZDY2NWI3ZjJhQGdyb3VwLmNhbGVuZGFyLmdvb2dsZS5jb20) @@ -378,7 +380,7 @@ If you are a collaborator or intern, follow the steps to join the mailing group - Create an event in it, whenever you have planned time off in order to let your colleagues know in advance -## Improve your English! +### Improve your English! - Make sure you have English checker in all your tools: - Pycharm: you can use @@ -393,7 +395,7 @@ If you are a collaborator or intern, follow the steps to join the mailing group are not sure about a word or a phrase - What's the point of doing an excellent job if you can't communicate it? 
-### Study an English grammar book +#### Study an English grammar book - I used [this](https://www.amazon.com/English-Grammar-Use-Self-study-Intermediate/dp/0521189063/ref=sr_1_3?ie=UTF8&qid=1536765989&sr=8-3&keywords=English+Grammar+in+Use) diff --git a/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md b/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md index 816049a2b1..7a38e941d8 100644 --- a/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md +++ b/docs/work_organization/all.use_github_and_zenhub.how_to_guide.md @@ -1,3 +1,5 @@ +# Use Github And Zenhub + - [Introduction](#introduction) @@ -15,6 +17,7 @@ * [Issue Estimate](#issue-estimate) * [PR](#pr) - [Issue workflows](#issue-workflows) + * [Naming an Issue](#naming-an-issue) * [Filing a new issue](#filing-a-new-issue) * [Updating an issue](#updating-an-issue) * [Closing an issue](#closing-an-issue) @@ -29,7 +32,7 @@ -# Introduction +## Introduction - In the following we use the abbreviations below: - GH = GitHub @@ -49,9 +52,9 @@ - Please install the [ZH extension](https://www.zenhub.com/extension) for GH, since it is going to make your life easier -# Concepts +## Concepts -## Sprints +### Sprints - Sprints are weekly, Monday - Friday and consist of the Issues worked on during the week @@ -65,8 +68,8 @@ - Each week's Sprint has Issues added to it by Team Leaders before Monday's work begins - Every Issue in a Sprint should have - - a point estimate - - an Epic + - A point estimate + - An Epic - The Team Member working on the Issue sets the point estimate by themselves or together with the Team Leader - Each sprint should have limits based on the estimates. E.g., a sprint cannot @@ -76,7 +79,7 @@ sacrificing other Issues in the Sprint, the point is to make the trade-off apparent -## Epics +### Epics - Epics are thematic groups of Issues that are somehow related by their topic - It may take multiple Sprints to complete all the Issues in an Epic @@ -93,7 +96,7 @@ description - We keep the Epics alphabetized on the board under the pipeline "EPICs" -### Master Epics +#### Master Epics - Master Epics are long-running Epics (i.e., projects) - E.g., `WEB3` @@ -103,7 +106,7 @@ - There is no need to add an Issue to a Master Epic if it is already added to a sub-Epic, since this is handled automatically by ZH -### Sub-Epics +#### Sub-Epics - Master Epics can be broken down into smaller Epics, called "sub-Epics" - E.g., `WEB3 - DaoCross v0.1` @@ -118,10 +121,9 @@ - Sub-Epics should belong to a Master Epic in ZH so that querying by Epic and sub-epics is simplified -### List of Epics +#### List of Epics - Below there is a list of the Epics and sub-Epics - - For simplicity we keep the information about the Epics here, instead of ZenHub @@ -247,7 +249,7 @@ - `Tulip` - `Utils` -## Issue +### Issue - Each Issue is a piece of work to be done - Issues are combined into Epics by topic @@ -258,7 +260,7 @@ - An issue might not have an assignee and estimate if it is not inside an epic but before execution of course it needs to be resolved -## Milestone +### Milestone - Milestone consist of group of tasks we want to accomplish during certain period @@ -272,7 +274,7 @@ - Not having a start and end date to an Epic is fine if it does not belong to the current milestone -## Label +### Label - Labels are attributes of an issue (or PR), e.g., `good first issue`, `PR_for_reviewers`, `duplicate`, etc. 
@@ -281,7 +283,7 @@ [cmamp](https://github.com/cryptokaizen/cmamp/labels) - The repos should always have labels in sync -### List of labels +#### List of labels - `Blocking`: This issue needs to be worked on immediately - `Bug`: Something isn't working @@ -306,7 +308,7 @@ - `To close`: An issue can be potentially closed > TODO(gp): -> To_close -## Pipeline +### Pipeline - A ZH Pipeline represents the "progress" status of an Issue in our process - We have the following Pipelines on the ZH board: @@ -348,7 +350,7 @@ stateDiagram Done --> [*] ``` -## Issue Estimate +### Issue Estimate - The Issue estimates ranges from 1 to 5: - 1 (e.g., a function rename, updating the entire code base and the unit @@ -358,24 +360,24 @@ stateDiagram - 4 (e.g., implement a new feature, where the solution is clear in advance) - 5 (e.g., implement a new feature, where the solution is complex) -## PR +### PR - A pull request is an event where a contributor asks to review code they want to merge into a project -# Issue workflows +## Issue workflows -## Naming an Issue +### Naming an Issue - Use an informative description, typically in the form an action - E.g., "Do this and that" - We don't use a period at the end of the title -- We prefer to avoid too much capitalization to make the Issue title easy to read - and for consistency with the rest of the bugs - +- We prefer to avoid too much capitalization to make the Issue title easy to + read and for consistency with the rest of the bugs + **Good** ``` - Optimize Prometheus configuration for enhanced Kubernetes monitoring + Optimize Prometheus configuration for enhanced Kubernetes monitoring ``` **Bad** @@ -385,7 +387,7 @@ stateDiagram - They are equivalent, but the first one is more readable -## Filing a new issue +### Filing a new issue - If it is a "serious" problem (bug) put as much information about the Issue as possible, e.g.,: @@ -436,7 +438,7 @@ stateDiagram - If you are unsure then you can leave it empty, but `@tag` Integrator / team leaders to make sure we can re-route and improve the Epics/Labels -## Updating an issue +### Updating an issue - For large or complex Issues, there should be a design phase (in the form of GH Issue, Google Doc, or design PR) before starting to write a code @@ -457,7 +459,7 @@ stateDiagram - If we decide to stop the work, add a `Paused` label and move it back to the backlog, e.g., `Sprint backlog (P0)`, `Product backlog (P1)`, `Icebox (P2)` -## Closing an issue +### Closing an issue - A task is closed when PR has been reviewed and merged into `master` - When, in your opinion, there is no more work to be done on your side on an @@ -475,9 +477,9 @@ stateDiagram - E.g. - closing as PR is merged - E.g. - closing since obsolete -# PR workflows +## PR workflows -## PR labels +### PR labels - `PR_for_authors` - There are changes to be addressed by an author of a PR @@ -487,9 +489,9 @@ stateDiagram - PR is ready for the final round of review by Integrators, i.e. 
close to merge -## Filing a new PR +### Filing a new PR -### General tips +#### General tips - Implement a feature in a branch (not `master`), once it is ready for review push it and file a PR via GH interface @@ -502,13 +504,12 @@ stateDiagram - If you want to make sure you are going in a right direction or just to confirm the interfaces you can also file a PR to discuss - Mark PR as draft if it is not ready, use the `convert to draft` button - - Draft PR should be filed when there is something to discuss with and demonstrate to the reviewer, but the feature is not completely implemented -### Filing process +#### Filing process - Add a description to help reviewers to understand what it is about and what you want the focus to be @@ -531,7 +532,7 @@ stateDiagram - Attach a command line to open a published notebook, see [here](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_tools/all.development.how_to_guide.md#open-a-published-notebook) -## Review +### Review - A reviewer should check the code: - Architecture @@ -548,7 +549,7 @@ stateDiagram - Pass it to integrators and mark it as `PR_for_integrators` - Usually is placed by team leaders after they approve PR -## Addressing comment +### Addressing comment - If the reviewer's comment is clear to the author and agreed upon: - The author addresses the comment with a code change and after changing the @@ -561,7 +562,7 @@ stateDiagram - Re-request the review - Mark it as `PR_for_reviewers` -## Coverage reports in PRs - discussion +### Coverage reports in PRs - discussion - We should start posting coverage reports in PRs. diff --git a/docs/work_tools/all.bfg_repo_cleaner.how_to_guide.md b/docs/work_tools/all.bfg_repo_cleaner.how_to_guide.md index 62e9223b88..dc3e4a2a81 100644 --- a/docs/work_tools/all.bfg_repo_cleaner.how_to_guide.md +++ b/docs/work_tools/all.bfg_repo_cleaner.how_to_guide.md @@ -1,3 +1,13 @@ + + + + +- [BFG Repo-Cleaner](#bfg-repo-cleaner) + * [Build](#build) + * [Usage](#usage) + + + # BFG Repo-Cleaner [BFG](https://rtyley.github.io/bfg-repo-cleaner/) dockerized. @@ -19,9 +29,10 @@ docker run -it --rm \ --delete-files id_rsa ``` -You could make this command more easily accessible by putting it in an executable, -and make sure that it is available in your `$PATH`. Alternatively, you could create -wrapper functions for your `docker run` commands ([example](https://github.com/jessfraz/dotfiles/blob/master/.dockerfunc)). +You could make this command more easily accessible by putting it in an +executable, and make sure that it is available in your `$PATH`. Alternatively, +you could create wrapper functions for your `docker run` commands +([example](https://github.com/jessfraz/dotfiles/blob/master/.dockerfunc)). 
```bash bfg() { diff --git a/docs/work_tools/all.chatgpt_api.how_to_guide.md b/docs/work_tools/all.chatgpt_api.how_to_guide.md index da7714d855..6c9cbfbc45 100644 --- a/docs/work_tools/all.chatgpt_api.how_to_guide.md +++ b/docs/work_tools/all.chatgpt_api.how_to_guide.md @@ -1,10 +1,9 @@ - +# Chatgpt Api - [OpenAI Assistant Runner & Manager](#openai-assistant-runner--manager) * [What is OpenAI Assistant](#what-is-openai-assistant) - * [Why using Assistant](#why-using-assistant) * [General pattern](#general-pattern) * [Code organization](#code-organization) * [How to use](#how-to-use) @@ -12,17 +11,17 @@ + [Running Assistant](#running-assistant) - [API library](#api-library) * [Usage](#usage) - + [File Structure](#file-structure) - + [Uploading and Retrieving Files](#uploading-and-retrieving-files) + + [File structure](#file-structure) + + [Uploading and retrieving Files](#uploading-and-retrieving-files) + [Managing Assistants](#managing-assistants) - + [ChatGPT Communication](#chatgpt-communication) - + [E2E Assistant Runner](#e2e-assistant-runner) + + [ChatGPT communication](#chatgpt-communication) + + [E2E assistant runner](#e2e-assistant-runner) -# OpenAI Assistant Runner & Manager +## OpenAI Assistant Runner & Manager -## What is OpenAI Assistant +### What is OpenAI Assistant - An assistant is similar to a modified GPT that has mastered some knowledge and is able to use that knowledge for future tasks @@ -35,14 +34,15 @@ of the conversation grows - By creating an Assistant, you build a new "instance" of ChatGPT and can give it some knowledge to learn -- This knowledge can be in many formats and up to 20 files (512MB each) at a time +- This knowledge can be in many formats and up to 20 files (512MB each) at a + time - With an instruction string, you define its behavior about how it should make use of those knowledge - When talking to an assistant, you can still add files in the message - These files does not count towards its 20 files' knowledge limit, as they are considered as input and will be forgotten eventually -## General pattern +### General pattern - Creation: - Send some guideline or example files for one type of task to the Assistant @@ -60,7 +60,7 @@ - Chatting is not yet implemented in our code, since command line scripts cannot save conversations. -## Code organization +### Code organization - Libraries are under `helpers`, e.g., - `helpers/hchatgpt.py` @@ -70,7 +70,8 @@ - `dev_scripts/chatgpt/manage_chatgpt_assistant.py` - `dev_scripts/chatgpt/run_chatgpt.py` -## How to use +### How to use + - Add the API KEY ```bash > export OPENAI_API_KEY= @@ -79,7 +80,7 @@ - Assistants are organization-wide, an assistant created under our Org can be accessed by any API key that belongs to our Org -### Assistant Manager +#### Assistant Manager - The interface is like: @@ -124,7 +125,7 @@ --retrieval_tool --code_tool ``` -### Running Assistant +#### Running Assistant - The script `dev_scripts/chatgpt/run_chatgpt.py` runs an assistant @@ -162,7 +163,7 @@ -o dev_scripts/chatgpt/example_data/gpt_linted_dropcontact.how_to_guide.md # Redirect its output to this file ``` -# API library +## API library - `helpers/hchatgpt.py` provides methods that wrap and interact with OpenAI API - By using these methods, you can easily build an assistant and chat to it with @@ -175,11 +176,11 @@ - Running threads with certain assistants - End-to-end communication method between users and the assistant -## Usage +### Usage The following snippets provide a basic overview of the code usage. 
-### File structure +#### File structure - Since OpenAI File manager does not hold folder structure, you use a cache dictionary to save the relation between our file (with folder) and OpenAI File @@ -189,7 +190,7 @@ The following snippets provide a basic overview of the code usage. - If you find anything buggy, try deleting this cache file and rerun the code so that it can be regenerated from scratch -### Uploading and retrieving Files +#### Uploading and retrieving Files - To upload a file to OpenAI, which you can later attach to messages/assistants: @@ -207,7 +208,7 @@ The following snippets provide a basic overview of the code usage. file_object = get_gpt_file_from_id(file_id) ``` -### Managing Assistants +#### Managing Assistants You can specify files an assistant should constantly use (like guidelines): @@ -222,7 +223,7 @@ add_files_to_assistant_by_name('assistant_name', ['new_file_path']) delete_file_from_assistant_by_name('assistant_name', 'file_path_to_remove') ``` -### ChatGPT communication +#### ChatGPT communication - Create a thread and send a message, with or without attaching files: @@ -239,7 +240,7 @@ delete_file_from_assistant_by_name('assistant_name', 'file_path_to_remove') response_messages = wait_for_run_result(thread_id, run_id) ``` -### E2E assistant runner +#### E2E assistant runner - Interact with an assistant conveniently with the `e2e_assistant_runner` function diff --git a/docs/work_tools/all.codebase_clean_up.how_to_guide.md b/docs/work_tools/all.codebase_clean_up.how_to_guide.md index 2ac0d5b102..f0e2ee1328 100644 --- a/docs/work_tools/all.codebase_clean_up.how_to_guide.md +++ b/docs/work_tools/all.codebase_clean_up.how_to_guide.md @@ -1,4 +1,4 @@ - +# Codebase Clean Up @@ -19,9 +19,9 @@ -# Codebase clean-up scripts +## Codebase clean-up scripts -## Problem +### Problem 1. Since we have multiple repos, we can't always easily replace code in one repo (e.g., with PyCharm) and have all the other repos work properly @@ -35,7 +35,7 @@ - The reviewers might ask some changes - This creates a lot of manual changes -## Solution: script approach +### Solution: script approach - Create a shell `sh` script that applies the correct changes to all the repos using [/dev_scripts/replace_text.py](/dev_scripts/replace_text.py) @@ -49,7 +49,7 @@ 3. 
We can check out a clean master, run the script to apply the changes automatically, regress and merge -## Using the script approach +### Using the script approach - We want to apply clean-up changes to the code base with a script @@ -70,7 +70,7 @@ - The author/reviewers should run the script on all the repos, run the unit tests, and merge (through a PR as usual) -# How to use `replace_text.py` +## How to use `replace_text.py` - See `-h` for updated list of options @@ -93,7 +93,7 @@ the files with extensions `.py`, `.ipynb`, `.txt`, `.md` and to do a `git mv` for files based on certain criteria -## Rename a file +### Rename a file - Preview the change ```bash @@ -111,7 +111,7 @@ > git mv ./dataflow/backtest/{research_backtest_utils.py,backtest_api.py} ``` -## Replace an import with a new one +### Replace an import with a new one ```bash > replace_text.py \ @@ -119,7 +119,7 @@ --new "import core.finance" ``` -## Replace text in a specific directory +### Replace text in a specific directory ```bash > replace_text.py \ @@ -130,7 +130,7 @@ --exts None ``` -## Revert all files but this one +### Revert all files but this one - There is an option `--revert_all` to apply this before the script ```bash @@ -141,7 +141,7 @@ xargs git checkout -- ``` -## Custom flows +### Custom flows ```bash > replace_text.py --custom_flow _custom1 @@ -152,7 +152,7 @@ > replace_text.py --custom_flow _custom2 --revert_all ``` -# Usage examples +## Usage examples - See [SorrIssue259](https://github.com/sorrentum/sorrentum/issues/259) and the related [PR](https://github.com/sorrentum/sorrentum/pull/336) for reference @@ -176,7 +176,7 @@ - Of course the changes need to be applied in one repo and then propagated to all the other repos if the tests are successful -## Instructions for the PR author +### Instructions for the PR author - Create a local branch called `...TaskXYZ_..._script` containing: - The code that needs to be changed manually @@ -202,7 +202,7 @@ automate as much as possible - Finally, the PR author merges the PR with the results of the script -### Example +#### Example - The name of script should be related to the task. 
E.g: `SorrTask259_Make_to_multi_line_cmd_public.sh` @@ -210,7 +210,7 @@ required functionality as provided in the above examples - Create a PR only with the script and the changes -## Instructions for the subrepo integrator +### Instructions for the subrepo integrator - Do a `git checkout` of the `...TaskXYZ_..._script` - Run the script diff --git a/docs/work_tools/all.conda_environment_obsolete.how_to_guide.md b/docs/work_tools/all.conda_environment_obsolete.how_to_guide.md index eac91c1dbf..07b1f73cc2 100644 --- a/docs/work_tools/all.conda_environment_obsolete.how_to_guide.md +++ b/docs/work_tools/all.conda_environment_obsolete.how_to_guide.md @@ -1,4 +1,4 @@ - +# Conda Environment Obsolete @@ -28,16 +28,16 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ -# Conda flow +## Conda flow -## (optional) Install anaconda +### (optional) Install anaconda - For the AWS machine there is already a central conda, so there is no need for users to install - For a laptop you need to install it yourself - You need _anaconda3_ -## Configure anaconda +### Configure anaconda - Configure anaconda for your shell using: ```bash @@ -45,7 +45,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ ``` - Anaconda3 adds a snippet of code in your `.bashrc` -## Create conda environment +### Create conda environment - This is needed to install all the packages that are required for development: ```bash @@ -58,7 +58,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ server `research`, for your laptop) - You can reuse the same environment for multiple Git clients -## Check conda environment +### Check conda environment - Check that your conda environment exists: ```bash @@ -69,7 +69,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ develop * /home//.conda/envs/develop ``` -## Configure conda environment +### Configure conda environment - Every time you open a shell you need to activate the development environment run: @@ -83,9 +83,9 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - Sets environment variables - Makes sure things are working properly -## Delete / recreate environment +### Delete / recreate environment -### Overwrite a conda environment with `create_conda.py` +#### Overwrite a conda environment with `create_conda.py` - You can use the option `--delete_env_if_exists` to overwrite a conda env, creating it from scratch @@ -100,7 +100,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ > create_conda.py -h ``` -### Manually delete a conda environment +#### Manually delete a conda environment - You can delete a conda environment by simply deleting the corresponding directory @@ -126,7 +126,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - Note that `develop.OLD` might not work anymore since all the links are broken by the move -### To delete the entire conda installation (advanced users) +#### To delete the entire conda installation (advanced users) - This is a dangerous operation, since it deletes the executable `conda` - You want to do this only when your environment is screwed up: a more expert @@ -140,7 +140,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - Run `rm -rf /anaconda3` - A good idea is to move it so you can resume the state -## Update anaconda +### Update anaconda - To update anaconda (i.e., the framework that manages conda packages and `conda` executable) @@ -163,7 +163,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ /Users/saggese/.conda/envs/amp_develop/bin/python ``` -## Configure user credentials +### Configure user credentials - For now this topic is obsolete. 
All development with AWS is running on a server side (or locally) in a docker container. Here you can find the @@ -175,7 +175,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - Typically you can just copy-paste a portion of the configuration of another user -## Be patient +### Be patient - The `create_conda.py` flow is designed to make our projects portable across: - Platforms (e.g., macOS, Linux) @@ -193,7 +193,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - If you hit a problem, be patient, ping GP / Paul, and we will extend the script to handle the quirks of your set-up -# Conda bloat +## Conda bloat - "Conda bloat" refers to the situation when there are more packages in the conda recipe than what strictly needed to allow us to make progress. @@ -211,7 +211,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - On the one side, we want to minimize "conda bloat". - On the other side, we want to be able to experiment with packages. -## Minimize conda bloat +### Minimize conda bloat - To minimize conda bloat, our process consists of adding a package to the conda recipe when a new package is actually needed by code and to run unit tests @@ -221,9 +221,9 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ it should not be in the repo at all" - Thus a corollary is that all code in the repo should be tested -# Conda environment lifecycle +## Conda environment lifecycle -## Experimental conda environment +### Experimental conda environment - On the other side we want to be free to experiment with a package that can save us tons of development time. @@ -244,7 +244,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - We can make this process more automated by generalizing the scripts we already have. -## Releasing a new conda environment +### Releasing a new conda environment - Once the new package is added to the official conda environment, we should: - Test the new conda environment locally, by creating a fresh environment and @@ -259,9 +259,9 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ - Typically GP takes care of getting all this fun stuff to work, but you are welcome to try locally to minimize surprises. 
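- As a sketch, the local test step mentioned above could look like the
  following (the `create_conda.py` invocation is illustrative beyond the
  `--delete_env_if_exists` option documented above, and the environment name
  `develop` mirrors the earlier examples; see `create_conda.py -h` for the
  actual options):
  ```bash
  # Recreate the environment from scratch using the updated recipe.
  > create_conda.py --delete_env_if_exists
  # Activate the fresh environment and run the unit tests against it.
  > conda activate develop
  > pytest
  ```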
-# Conda maintenance (only for admins) +## Conda maintenance (only for admins) -## Updating conda itself +### Updating conda itself - To update conda itself you can run: @@ -277,7 +277,7 @@ _THIS IS OBSOLETE AFTER DOCKER DEV CONTAINER_ 3.8.0 ``` -## Cleaning conda packages +### Cleaning conda packages - One can clean up the entire cache of packages with: diff --git a/docs/work_tools/all.development.how_to_guide.md b/docs/work_tools/all.development.how_to_guide.md index eba7bb739a..75504a1dff 100644 --- a/docs/work_tools/all.development.how_to_guide.md +++ b/docs/work_tools/all.development.how_to_guide.md @@ -1,4 +1,4 @@ - +# Development @@ -7,8 +7,7 @@ * [Check Git credentials](#check-git-credentials) * [Setting Git credentials](#setting-git-credentials) * [Enforcing Git credentials](#enforcing-git-credentials) -- [Create the env](#create-the-env) -- [Playback](#playback) +- [Create the thin env](#create-the-thin-env) - [Publish a notebook](#publish-a-notebook) * [Detailed instructions](#detailed-instructions) * [Publish notebooks](#publish-notebooks) @@ -36,9 +35,9 @@ -# Setting up Git credentials +## Setting up Git credentials -## Preamble +### Preamble - Git allows setting credentials at different "levels": - System (set for all the users in `/etc/git`) @@ -52,7 +51,7 @@ https://git-scm.com/book/en/v2/Customizing-Git-Git-Configuration - Details on `git config`: https://git-scm.com/docs/git-config -## Check Git credentials +### Check Git credentials - You can check the Git credentials that will be used to commit in a client by running: @@ -85,7 +84,7 @@ Update hooks ``` -## Setting Git credentials +### Setting Git credentials - To keep things simple and avoid variability, our convention is to use: - As `user.name` our Linux user name on the local computer we are using to @@ -108,7 +107,7 @@ - Note that you need to set these local values on each Git client that you have cloned, since Git doesn't version control these values -## Enforcing Git credentials +### Enforcing Git credentials - We use Git hooks to enforce that certain emails are used for certain repos (e.g., we should commit to our open-source repos only using our personal @@ -130,9 +129,10 @@ - You can also use the action `status` to see the status and `remove` to the hooks. -# Create the thin env +## Create the thin env - You can follow the + ```bash # Build the client env. > dev_scripts/client_setup/build.sh 2>&1 | tee tmp.build.log @@ -140,9 +140,10 @@ ``` - The installation is successful if you see at the end of the output + ```verbatim ... - # Installation + # Installation # Configure your client with: > source dev_scripts/setenv_amp.sh ``` @@ -159,7 +160,7 @@ ==> SUCCESS <== ``` -# Publish a notebook +## Publish a notebook - `publish_notebook.py` is a little tool that allows to: 1. 
Opening a notebook in your browser (useful for read-only mode) @@ -178,7 +179,7 @@ - One can take a snapshot and visually compare multiple notebooks side-by-side for changes -## Detailed instructions +### Detailed instructions - You can get details by running: @@ -189,7 +190,7 @@ - Plug-in for Chrome [my-s3-browser](https://chrome.google.com/webstore/detail/my-s3-browser/lgkbddebikceepncgppakonioaopmbkk?hl=en) -## Publish notebooks +### Publish notebooks - Make sure that your environment is set up properly @@ -244,15 +245,15 @@ --aws_profile am ``` -## Open a published notebook +### Open a published notebook -### Start a server +#### Start a server - `(cd /local/home/share/html/published_notebooks; python3 -m http.server 8000)` - Go to the page in the local browser -### Using the dev box +#### Using the dev box - To open a notebook saved on S3, \*outside\* a Docker container run: @@ -271,7 +272,7 @@ - And then navigate to the path (e.g., `/local/home/share/html/published_notebooks/Master_forecast_processor_reader.20220810-112328.html`) -### Using Windows browser +#### Using Windows browser - Another approach is: @@ -284,9 +285,9 @@ - For some reason, Chrome saves the link instead of opening, so you need to click on the saved link -# How to create a private fork +## How to create a private fork -- https://stackoverflow.com/questions/10065526/github-how-to-make-a-fork-of-public-repository-private +- Https://stackoverflow.com/questions/10065526/github-how-to-make-a-fork-of-public-repository-private - From https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-on-github/duplicating-a-repository @@ -298,19 +299,19 @@ - It worked only as cryptomtc, but not using my key -# Integrate public to private: `amp` -> `cmamp` +## Integrate public to private: `amp` -> `cmamp` -## Set-up +### Set-up ```bash > git remote add public git@github.com:alphamatic/amp -# Go to cmamp +## Go to cmamp > cd /data/saggese/src/cmamp1 > cd /Users/saggese/src/cmamp1 -# Add the remote -# git remote add public https://github.com/exampleuser/public-repo.git +## Add the remote +## git remote add public https://github.com/exampleuser/public-repo.git > git remote add public git@github.com:alphamatic/amp > git remote -v @@ -320,7 +321,7 @@ public git@github.com:alphamatic/amp (fetch) public git@github.com:alphamatic/amp(push) ``` -## Ours vs theirs +### Ours vs theirs - From https://stackoverflow.com/questions/25576415/what-is-the-precise-meaning-of-ours-and-theirs-in-git/25576672 @@ -332,28 +333,28 @@ public git@github.com:alphamatic/amp(push) - Ours = branch being rebased onto (e.g., master) - Theirs = branch being rebased (e.g., feature) -## Sync the repos (after double integration) +### Sync the repos (after double integration) ```bash > git fetch origin; git fetch public -# Pull from both repos +## Pull from both repos > git pull public master -X ours -# You might want to use `git pull -X theirs` or `ours` +## You might want to use `git pull -X theirs` or `ours` > git pull -X theirs > git pull public master -s recursive -X ours -# When there is a file added it is better to add +## When there is a file added it is better to add > git diff --name-status --diff-filter=U | awk '{print $2}' im/ccxt/db/test/test_ccxt_db_utils.py -# Merge branch +## Merge branch > gs + git status @@ -366,7 +367,7 @@ nothing to commit, working tree clean > git pull -X ours -## Make sure it's synced at ToT +### Make sure it's synced at ToT > rsync --delete -r /Users/saggese/src/cmamp2/ 
/Users/saggese/src/cmamp1 --exclude='.git/' @@ -374,36 +375,36 @@ nothing to commit, working tree clean > diff -r --brief /Users/saggese/src/cmamp1 /Users/saggese/src/cmamp2 | grep -v \.git ``` -## Updated sync +### Updated sync ```bash > git fetch origin; git fetch public ``` -## Check that things are fine +### Check that things are fine ```bash > git diff origin/master... >patch.txt > cd /Users/saggese/src/cmamp2 -# Create a branch +## Create a branch > git checkout -b Cmamp114_Integrate_amp_cmamp_20210928 > git apply patch.txt -# Compare branch with references +## Compare branch with references > dev_scripts/diff_to_vimdiff.py --dir1 /Users/saggese/src/cmamp1/im --dir2 /Users/saggese/src/cmamp2/im > diff -r --brief /Users/saggese/src/lemonade3/amp \~/src/cmamp2 | grep -v "/im" -# Creates a merge commit +## Creates a merge commit > git push origin master ``` -## Integrate private to public: `cmamp` -> `amp` +### Integrate private to public: `cmamp` -> `amp` ```bash > cd /data/saggese/src/cmamp1 @@ -421,7 +422,7 @@ nothing to commit, working tree clean > cmamp master -X ours ``` -## Squash commit of everything in the branch +### Squash commit of everything in the branch - From https://stackoverflow.com/questions/25356810/git-how-to-squash-all-commits-on-branch @@ -435,11 +436,11 @@ nothing to commit, working tree clean > git push --force ``` -# Double integration `cmamp` < -- > `amp` +## Double integration `cmamp` < -- > `amp` - The bug is https://github.com/alphamatic/amp/issues/1786 -## Script set-up +### Script set-up ```bash > vi /Users/saggese/src/amp1/dev_scripts/integrate_repos/setup.sh @@ -454,26 +455,26 @@ Update the date > source /Users/saggese/src/amp1/dev_scripts/integrate_repos/setup.sh ``` -## Manual set-up branches +### Manual set-up branches ```bash -# Go to cmamp1 +## Go to cmamp1 > go_amp.sh cmamp 1 -# Set up the env vars in both clients +## Set up the env vars in both clients > export AMP_DIR=/Users/saggese/src/amp1; export CMAMP_DIR=/Users/saggese/src/cmamp1; echo "$AMP_DIR"; ls $AMP_DIR; echo "$CMAMP_DIR"; ls $CMAMP_DIR -# Create two branches +## Create two branches > export BRANCH_NAME=AmpTask1786_Integrate_20211010 export BRANCH_NAME=AmpTask1786_Integrate_2021117 ... 
> cd $AMP_DIR -# Create automatically +## Create automatically > i git_create_branch -b $BRANCH_NAME -# Create manually +## Create manually > git checkout -b $BRANCH_NAME > git push --set-upstream origin $BRANCH_NAME @@ -481,7 +482,7 @@ $AMP_DIR; echo "$CMAMP_DIR"; ls $CMAMP_DIR > i git_create_branch -b $BRANCH_NAME ``` -## High-level plan +### High-level plan - SUBDIR=im - Typically `cmamp` is copied on top of `amp` @@ -490,30 +491,30 @@ $AMP_DIR; echo "$CMAMP_DIR"; ls $CMAMP_DIR - Everything else - Typically `amp` -> `cmamp` -## Sync `im` `cmamp` -> `amp` +### Sync `im` `cmamp` -> `amp` ```bash SUBDIR=im -# Check different files +## Check different files > diff -r --brief $AMP_DIR/$SUBDIR $CMAMP_DIR/$SUBDIR | grep -v .git -# Diff the entire dirs with vimdiff +## Diff the entire dirs with vimdiff > dev_scripts/diff_to_vimdiff.py --dir1 $AMP_DIR/$SUBDIR --dir2 $CMAMP_DIR/$SUBDIR -# Find different files +## Find different files > find $AMP_DIR/$SUBDIR -name "*"; find $CMAMP_DIR/$SUBDIR -name "*" sdiff /tmp/dir1 /tmp/dir2 -# Copy cmamp -> amp +## Copy cmamp -> amp > rsync --delete -au $CMAMP_DIR/$SUBDIR/ $AMP_DIR/$SUBDIR -a = archive -u = ignore newer -# Add all the untracked files +## Add all the untracked files > cd $AMP_DIR/$SUBDIR && git add $(git ls-files -o --exclude-standard) -# Check that there are no differences after copying +## Check that there are no differences after copying > dev_scripts/diff_to_vimdiff.py --dir1 $AMP_DIR/$SUBDIR --dir2 $CMAMP_DIR/$SUBDIR ========== @@ -559,13 +560,13 @@ cd+++++++ real_time/test/TestRealTimeReturnPipeline1.test1/output/ > f..t.... returns/test/TestReturnsBuilder.test_futures1/output/test.txt ``` -## Sync everything +### Sync everything ```bash -# Check if there is anything in cmamp more recent than amp +## Check if there is anything in cmamp more recent than amp > rsync -au --exclude='.git' --exclude='devops' $CMAMP_DIR/ $AMP_DIR -# vimdiff +## vimdiff > dev_scripts/diff_to_vimdiff.py --dir1 $AMP_DIR --dir2 $CMAMP_DIR @@ -573,20 +574,20 @@ F1: skip F9: choose left (i.e., amp) F10: choose right (i.e,. cmamp) -# Copy +## Copy > rsync -au --delete --exclude='.git' --exclude='devops' --exclude='im' $AMP_DIR/ $CMAMP_DIR -# Add all the untracked files +## Add all the untracked files > (cd $CMAMP_DIR/$SUBDIR && git add $(git ls-files -o --exclude-standard)) > diff -r --brief $AMP_DIR $CMAMP_DIR | grep -v .git | grep Only ``` -## Files that need to be different +### Files that need to be different - `amp` needs an `if False` `helpers/lib_tasks.py` @@ -606,7 +607,7 @@ TODO(gp): How to copy files in vimdiff including last line? > find . -name "\*.txt" | xargs perl -pi -e 'chomp if eof' ``` -### Testing +#### Testing - Run `amp` on my laptop (or on the server) - IN PROGRESS: Get `amp` PR to pass on GH diff --git a/docs/work_tools/all.dind_and_sibling_containers.how_to_guide.md b/docs/work_tools/all.dind_and_sibling_containers.how_to_guide.md new file mode 100644 index 0000000000..c813b7050c --- /dev/null +++ b/docs/work_tools/all.dind_and_sibling_containers.how_to_guide.md @@ -0,0 +1,100 @@ + + + + +- [Docker-in-docker (dind)](#docker-in-docker-dind) + * [Sibling container approach](#sibling-container-approach) + + [Connecting to Postgres instance using sibling containers](#connecting-to-postgres-instance-using-sibling-containers) + + + +# Docker-in-docker (dind) + +- It is possible to install a Docker engine inside a Docker container so that + one can run Docker container (e.g., OMS or IM) inside an isolated `amp` + container. 
+- The problems with this approach are: + - Dind requires to run the external container in privileged mode, which might + not be possible due to security concerns + - The Docker / build cache is not shared across parent and children + containers, so one needs to pull / build an image every time the outermost + container is restarted +- An alternative approach is the "sibling container" approach + +## Sibling container approach + +- Refs: + - [Can I run Docker-in-Docker without using the --privileged flag - Stack Overflow](https://stackoverflow.com/questions/29612463/can-i-run-docker-in-docker-without-using-the-privileged-flag) + - [https://jpetazzo.github.io/2015/09/03/do-not-use-docker-in-docker-for-ci/](https://jpetazzo.github.io/2015/09/03/do-not-use-docker-in-docker-for-ci/) +- Often what's really needed is the ability to build / run a container from + another container (e.g., CI or unit test). This can be achieved by mounting + the Docker socket `/var/run/docker.sock` to the container, so that a container + can talk to Docker Engine. +- This approach allows reuse of the build cache across the sibling containers. +- The downside is less isolation from the external container, e.g., spawned + containers can be left hanging or can collide. +- E.g., + ``` + # Run `docker ps` in a container, showing the containers running in the main + container + > docker run -ti --rm \ + -v /var/run/docker.sock:/var/run/docker.sock \ + dindtest \ + docker ps + + # Start a sibling hello world container: + > docker run -it --rm \ + -v /var/run/docker.sock:/var/run/docker.sock \ + dindtest \ + docker run -ti --rm hello-world + ``` + +### Connecting to Postgres instance using sibling containers + +- We can start the Docker container with Postgres as a service from outside the + container. + ``` + > (cd oms; i oms_docker_up -s local) + INFO: > cmd='/local/home/gsaggese/src/venv/amp.client_venv/bin/invoke oms_docker_up -s local' + report_memory_usage=False report_cpu_usage=False + docker-compose \ + --file /local/home/gsaggese/src/sasm-lime4/amp/oms/devops/compose/docker-compose.yml \ + --env-file /local/home/gsaggese/src/sasm-lime4/amp/oms/devops/env/local.oms_db_config.env \ + up \ + oms_postgres + Creating compose_oms_postgres_1 ... done + Attaching to compose_oms_postgres_1 + oms_postgres_1 | + oms_postgres_1 | PostgreSQL Database directory appears to contain a database; Skipping initialization + oms_postgres_1 | + oms_postgres_1 | 2022-05-19 22:57:15.659 UTC [1] LOG: starting PostgreSQL 13.5 (Debian 13.5-1.pgdg110+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit + oms_postgres_1 | 2022-05-19 22:57:15.659 UTC [1] LOG: listening on IPv4 address "0.0.0.0", port 5432 + oms_postgres_1 | 2022-05-19 22:57:15.659 UTC [1] LOG: listening on IPv6 address "::", port 5432 + oms_postgres_1 | 2022-05-19 22:57:15.663 UTC [1] LOG: listening on Unix socket "/var/run/postgresql/.s.PGSQL.5432" + oms_postgres_1 | 2022-05-19 22:57:15.670 UTC [25] LOG: database system was shut down at 2022-05-19 22:56:50 UTC + oms_postgres_1 | 2022-05-19 22:57:15.674 UTC [1] LOG: database system is ready to accept connections + ``` +- Note that Postgres needs to be +- Start a container able to +- From inside a container I launch postgres through the /var/... + ``` + > docker ps | grep postgres + CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES + 83bba0818c74 postgres:13 "docker-entrypoint.s..." 
6 minutes ago Up 6 minutes + 0.0.0.0:5432->5432/tcp compose-oms_postgres-1 + ``` +- Test connection to the DB from outside the container + ``` + > psql --host=cf-spm-dev4 --port=5432 --user aljsdalsd -d oms_postgres_db_local + Password for user aljsdalsd: + psql (9.5.25, server 13.5 (Debian 13.5-1.pgdg110+1)) + WARNING: psql major version 9.5, server major version 13. + Some psql features might not work. + Type "help" for help. + oms_postgres_db_local=# + ``` +- Test connection to the DB from inside the container + ``` + > psql --host=cf-spm-dev4 --port=5432 --user aljsdalsd -d oms_postgres_db_local + ... + ``` diff --git a/docs/work_tools/all.docker.tutorial.md b/docs/work_tools/all.docker.tutorial.md new file mode 100644 index 0000000000..28229fb61e --- /dev/null +++ b/docs/work_tools/all.docker.tutorial.md @@ -0,0 +1,257 @@ + + + + +- [Docker](#docker) + * [Introduction](#introduction) + * [Concepts](#concepts) + + [Docker image](#docker-image) + + [Dockerfile](#dockerfile) + + [Docker container](#docker-container) + + [Docker registry](#docker-registry) + * [Poetry](#poetry) + * [Build a Docker image](#build-a-docker-image) + + [General](#general) + - [Base image](#base-image) + - [Copy files](#copy-files) + - [Install OS packages](#install-os-packages) + - [Install Python packages](#install-python-packages) + + [Build an image from a Dockerfile](#build-an-image-from-a-dockerfile) + * [Run multi-container Docker application](#run-multi-container-docker-application) + + [Version](#version) + + [Images](#images) + + [Bind mount](#bind-mount) + + [Environment variables](#environment-variables) + + [Basic commands](#basic-commands) + + + +# Docker + +## Introduction + +- Docker is an open-source tool designed to make our life typically easier + (although it takes energy and time to master) when creating, building, + deploying, and running software applications. +- Docker can package an application and its dependencies in a virtual container + that can run on any Linux, Windows, or macOS computer. +- Our Docker containers have everything required (e.g. OS packages, Python + packages) inside to run certain applications/code. + +## Concepts + +### Docker image + +- A Docker image is a read-only template with instructions for creating a Docker + container +- Typically a Docker image includes needed libraries and packages and their + versions + +### Dockerfile + +- A `Dockerfile` is a text document that contains all the commands to call on + the command line to assemble an image. E.g. + `//cmamp/devops/docker_build/dev.Dockerfile`. + +### Docker container + +- A Docker container is a runnable instance of an image. One can run code inside + a Docker container having all requirements installed. + +### Docker registry + +- A Docker registry stores Docker images. In other words, Docker registry for + docker images is like GitHub for code. 
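+- For example, a minimal interaction with a registry could look like this (the
+  private registry URL below is purely illustrative):
+  ```
+  # Pull an image from the default registry (Docker Hub).
+  > docker pull postgres:13
+  # Tag the image and push it to a private registry.
+  > docker tag postgres:13 my-registry.example.com/postgres:13
+  > docker push my-registry.example.com/postgres:13
+  ```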
+ +## Poetry + +- Poetry is a tool for managing Python packages and dependencies and allows to: + - List packages you want to install with some constraints + - E.g., `pandas` must be above 1.0 in `devops/docker_build/pyproject.toml` + - Given a list of packages you need to install to get the desired environment, + `poetry` "optimizes" the package versions and generate + `devops/docker_build/poetry.lock`, which contains the list of versions of + the packages to install + - If there is a new version of a package re-running `poetry` might give you an + updated list of packages to install + +## Build a Docker image + +### General + +- A docker image is built from a `Dockerfile`. The image is then used to run a + Docker container. + + + +- There is `/devops` dir under a project's dir that contains Docker-related + files, e.g. `cmamp/devops` + +#### Base image + +- A `Dockerfile` should start with specifying a base image. +- The base image is an image that a new image is built on top of. A new Docker + image will have all the packages/dependencies that are installed in the base + image. +- Use `FROM` statement to specify a base image, e.g. + ``` + FROM ubuntu:20.4 + ``` + +#### Copy files + +- Copy files that are required to build a Docker image to the Docker filesystem +- To copy a file from `/source_dir` (your filesystem) to `/dst_dir` (Docker + filesystem) do: + ``` + COPY source_dir/file dst_dir + ``` +- E.g., the command below will copy `install_packages.sh` from + `devops/docker_build` to the Docker's root directory so that + `install_packages.sh` can be accessed by Docker + ``` + COPY devops/docker_build/install_packages.sh . + ``` + +#### Install OS packages + +- Install OS packages that are needed for a Docker app, but that are not + installed for a base image +- Use `RUN` instruction to install a package, e.g. + ``` + RUN apt-get install postgresql-client + ``` +- Alternatively you can package all installation instructions in a `.sh` file + and run it. Do not forget to copy a `.sh` file to the Docker filesystem so + that Docker can see it. E.g., + ``` + COPY devops/docker_build/install_packages.sh . + RUN /bin/sh -c "./install_packages.sh" + ``` + +#### Install Python packages + +- We prefer to install Python packages with `poetry` +- Make sure that there is instruction to install `pip3` and `poetry`. You can + either put it in a `Dockerfile` or in a separate file like + `install_packages.sh`. + ``` + RUN apt-get install python3-pip + RUN pip3 install poetry + ``` +- Copy poetry-related files to the Docker filesystem so that files can be + accessed by Docker + ``` + COPY devops/docker_build/poetry.toml + COPY devops/docker_build/poetry.lock + ``` +- Install Python packages + ``` + RUN poetry install + ``` + +### Build an image from a Dockerfile + +- To build an image from a `Dockerfile` run: + ``` + > docker build . + ``` +- The `Dockerfile` must be called `Dockerfile` and located in the root of the + build context +- You can point to any `Dockerfile` by using `-f`: + ``` + > docker build -f /path/to/dockerfile + ``` + +## Run multi-container Docker application + +- Docker Compose is a tool for defining and running multi-container Docker + applications +- With Docker Compose you use a `YAML` file to configure your application's + services + + + +### Version + +- At the beginning of a `docker-compose.yaml` file specify the `docker-compose` + version. 
For more information see + [the official documents](https://docs.docker.com/compose/compose-file/compose-versioning/) + ``` + version: "3.0" + ``` + +### Images + +- You can either re-use a public image or build a new one from a `Dockerfile` +- The `app` service below uses the image that is built from the `dev.Dockerfile` + ``` + app: + build: + context: . + dockerfile: dev.Dockerfile + ``` +- The `im_postgres_local` service below uses the public `postgres` image pulled + from the [Docker hub registry](https://hub.docker.com/_/postgres) + ``` + im_postgres_local: + image: postgres: 13 + ``` + +### Bind mount + +- If you want to be able to share files between the host and a Docker container, + you should bind-mount a directory +- E.g. mount current directory to `/app` dir inside a Docker container: + ``` + app: + volumes: + - .:/app + ``` + +### Environment variables + +- You can either use variables directly from the environment or pass them in a + `docker-compose.yaml` file +- It is supposed that `POSTGRES_VERSION` is already defined in the shell. + ``` + db: + image: "postgres:${POSTGRES_VERSION}" + ``` +- Set environment variable in a service's container + ``` + db: + environment: + - POSTGRES_VERSION=13 + image: "postgres:${POSTGRES_VERSION}" + ``` +- Set environment variable with `.env` file + ``` + db: + env_file: + - ./postgres_env.env + image: "postgres:${POSTGRES_VERSION}" + ``` +- File `postgres_env.env` + ```bash + > cat ./postgres_env.env + POSTGRES_VERSION=13 + ``` + +### Basic commands + +- To check more advanced usage, please see + [the official documentation](https://docs.docker.com/compose/reference/). + +- Let's assume that the `docker-compose.yaml` file is located in the current dir + + ```bash + # Build, (re)create, start, and attach to containers for a service. 
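  # Note: adding the `-d` flag (e.g., `docker-compose up -d`) starts the
  # containers in detached / background mode.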
+ > docker-compose up + + # List containers + > docker-compose ps + + # Stop containers created with `up` + > docker-compose down + ``` diff --git a/docs/work_tools/all.docker_dev_tools_container.how_to_guide.md b/docs/work_tools/all.docker_dev_tools_container.how_to_guide.md new file mode 100644 index 0000000000..ed35fb54f2 --- /dev/null +++ b/docs/work_tools/all.docker_dev_tools_container.how_to_guide.md @@ -0,0 +1,79 @@ + + + + +- [Dev tools container](#dev-tools-container) + * [dev_tools](#dev_tools) + + + +# Dev tools container + +## dev_tools + +- File an Issue for the release +- Create the corresponding branch in dev_tools +- Change the code +- Run the release flow end-to-end + ``` + > i docker_release_dev_image --version 1.1.0 + > i docker_release_prod_image --version 1.1.0 + ``` + TODO(Vlad): Add a command to run the push to Dockerhub and add it to the + single arch release flow +- Push the image to Dockerhub manually + - Login to Dockerhub with the `sorrentum` account + ``` + > docker login --username=sorrentum + ``` + - Tag the dev version image as `sorrentum/dev_tools:dev` + ``` + > docker tag 665840871993.dkr.ecr.us-east-1.amazonaws.com/dev_tools:dev-1.1.0 sorrentum/dev_tools:dev + ``` + - Push the dev image to Dockerhub + ``` + > docker push sorrentum/dev_tools:dev + ``` + - Tag the prod version image as `sorrentum/dev_tools:prod` + ``` + > docker tag 665840871993.dkr.ecr.us-east-1.amazonaws.com/dev_tools:prod sorrentum/dev_tools:prod + ``` + - Push the prod image to Dockerhub + ``` + > docker push sorrentum/dev_tools:prod + ``` +- Push the latest `prod` image to GHCR registry manually for GH actions to use + it + - Perform a Docker login using your GitHub username and PAT (Personal Access + Token): + ```bash + > docker login ghcr.io -u + ``` + - Tag the `prod` image to the GHCR namespace: + ```bash + > docker tag 623860924167.dkr.ecr.eu-north-1.amazonaws.com/dev_tools:prod ghcr.io/cryptokaizen/dev_tools:prod + ``` + - Push the tagged image to the GHCR registry: + ```bash + > docker push ghcr.io/cryptokaizen/dev_tools:prod + ``` + +- Update the changelog, i.e. `//dev_tools/changelog.txt` + - The changelog should be updated only after the image is released; otherwise + the sanity checks will assert that the release's version is not higher than + the latest version recorded in the changelog. + - Specify what has changed + - Pick the release version accordingly + - NB! The release version should consist of 3 digits, e.g. 
"1.1.0" instead + of "1.1" + - We use [semantic versioning](https://semver.org/) convention + - For example, adding a package to the image would mean bumping up version + 1.0.0 to 1.0.1 +- Do a PR with the change including the updated `changelog.txt` +- Send a message on the `all@` chat telling people that a new version of the + container has been released + - Users need to do + - `i docker_pull` from `dev_tools`, + - `i docker_pull_dev_tools` from `cmamp` + - Users need to make sure to pull docker after the master is up-to-date + (including amp submodules) diff --git a/docs/work_tools/all.docker_optimizer_container.how_to_guide.md b/docs/work_tools/all.docker_optimizer_container.how_to_guide.md new file mode 100644 index 0000000000..f599945978 --- /dev/null +++ b/docs/work_tools/all.docker_optimizer_container.how_to_guide.md @@ -0,0 +1,346 @@ + + + + +- [Optimizer container](#optimizer-container) + * [Rationale](#rationale) + * [Build and run a local version of `opt`](#build-and-run-a-local-version-of-opt) + * [Internals](#internals) + + [One container per Git repo](#one-container-per-git-repo) + + [Multiple containers per Git repo](#multiple-containers-per-git-repo) + - [Mounting only `optimizer` dir inside Docker](#mounting-only-optimizer-dir-inside-docker) + - [Mounting the supermodule (e.g., lime, lemonade, amp) inside Docker](#mounting-the-supermodule-eg-lime-lemonade-amp-inside-docker) + * [Invariants](#invariants) + * [Release and ECR flow](#release-and-ecr-flow) + * [Unit testing code inside `opt` container](#unit-testing-code-inside-opt-container) + + [Avoid compiling code depending from cvxopt when running amp](#avoid-compiling-code-depending-from-cvxopt-when-running-amp) + + [Run optimizer tests in a stand-alone `opt` container](#run-optimizer-tests-in-a-stand-alone-opt-container) + + [Run optimizer tests as part of running unit tests for `cmamp`](#run-optimizer-tests-as-part-of-running-unit-tests-for-cmamp) + * [Call a Dockerized executable from a container](#call-a-dockerized-executable-from-a-container) + + + +# Optimizer container + +## Rationale + +- The high-level goal is to move towards containerized Python scripts running in + smaller containers instead of keep adding packages to `amp` / `cmamp`, which + makes the `amp` / `cmamp` container bloated and risky to build +- Along this design philosophy similar to microservices, we want to have a + Docker container, called `opt` with a Python script that uses some packages + that are not compatible with `amp` (specifically cvxopt, cvxpy) +- This is similar to what we do for the `dev_tools`, which is like a + containerized Python script for the linter + +## Build and run a local version of `opt` + +- You can build the container locally with: + ``` + > cd optimizer + > i opt_docker_build_local_image --version 0.1.0 + ``` +- This process takes around 5 mins and then you should have the container + ``` + docker image ls 665840871993.dkr.ecr.us-east-1.amazonaws.com/opt:local-saggese-0.1.0 + REPOSITORY TAG IMAGE ID CREATED SIZE + 665840871993.dkr.ecr.us-east-1.amazonaws.com/opt local-saggese-0.1.0 bb7d60d6a7d0 7 seconds ago 1.23GB + ``` +- Run the container as: + ``` + > i opt_docker_bash --stage local --version 0.1.0 + ``` +- To run a Jupyter notebook in the `opt` container: + +## Internals + +### One container per Git repo + +- A simple approach is to have each deployable unit (i.e., container) + corresponding to a Git repo + - The consequence would be: + - A multiplication of repos + - No implicit sharing of code across 
different containers + - Some mechanism to share code (e.g., `helpers`) across repos (e.g., using + bind mount) + - Not playing nice with Git subrepo mechanism since Docker needs to see the + entire repo + +- So the code would be organized in 4 repos: + ``` + - lemonade / lime + - helpers + - optimizer + - oms + - models in amp + ``` + - Where the dependency between containers are + - Lemonade -> amp + - Amp -> optimizer, helpers + - Optimizer -> helpers, core + +### Multiple containers per Git repo + +- Another approach is to have `optimizer` as a directory inside `amp` + - This keeps `amp` and `optimizer` in a single repo + - To build / run optimizer code in its container one needs to `cd` in the dir + - The problem then becomes how to share `helpers` + +#### Mounting only `optimizer` dir inside Docker + +- From `devops/compose/docker-compose.yml` + ``` + 42 volumes: + 43 # Move one dir up to include the entire git repo (see AmpTask1017). + 44 - ../../:/app + 45 # Move one dir down to include the entire git repo (see AmpTask1017). + 46 working_dir: /app + ``` +- From `devops/docker_build/dev.Dockerfile` +- ENTRYPOINT ["devops/docker_run/entrypoint.sh"] +- The problem is that Git repo doesn't work anymore + ``` + git --version: git version 2.30.2 + fatal: not a git repository (or any parent up to mount point /) + Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set). + ``` +- A work around is to inject .git in /git of the container and then point git to + that + ``` + environment: + ... + - GIT_DIR=/git + + volumes: + # Move one dir up to include the entire git repo (see AmpTask1017). + - ../../:/app + - ../../../../.git:/git + - ../../../../amp/helpers:/app/helpers + ``` + +- Git works but it gets confused with the paths + ``` + modified: .dockerignore + deleted: .github/gh_requirements.txt + deleted: .github/workflows/build_image.yml.DISABLED + deleted: .github/workflows/fast_tests.yml + deleted: .github/workflows/linter.yml.DISABLED + deleted: .github/workflows/slow_tests.yml + deleted: .github/workflows/superslow_tests.yml.DISABLED + deleted: .gitignore + ``` + +#### Mounting the supermodule (e.g., lime, lemonade, amp) inside Docker + +- From `devops/compose/docker-compose.yml` + ``` + 42 volumes: + 43 # Move one dir up to include the entire git repo (see AmpTask1017). + 44 - ../../../:/app + 45 # Move one dir down to include the entire git repo (see AmpTask1017). + 46 working_dir: /app/amp + ``` +- From `devops/docker_build/dev.Dockerfile` +- ENTRYPOINT ["optimizer/devops/docker_run/entrypoint.sh"] +- This approach mounts 4 dirs up from devops/compose/docker-compose.yml, i.e., + //lime +- The problem with this approach is that now repo_config.py is incorrect +- `i opt_docker_build_local_image --version 0.4.0` + ``` + 32 - ../../../helpers:/app/amp/optimizer/helpers + 33 + 34 # Shared cache. This is specific of lime. + 35 - /local/home/share/cache:/cache + 36 + 37 # Mount `amp` when it is used as submodule. In this case we need to + 38 # mount the super project in the container (to make git work with the + 39 # supermodule) and then change dir to `amp`. + 40 app: + 41 extends: + 42 base_app + 43 volumes: + 44 # Move one dir up to include the entire git repo (see AmpTask1017). + 45 - ../../../../:/app + 46 # Move one dir down to include the entire git repo (see AmpTask1017). 
+ 47 working_dir: /app/amp/optimizer + 48 #entrypoint: /bin/bash -c "ls helpers" + ``` + +## Invariants + +- A deployable dir is a dir under a Git repo + - It corresponds to a software component (code + library = Docker container) + - Anything that has a devops dir is "deployable" +- Each Docker container is run from its corresponding dir, e.g., + - Amp container from the amp dir + - Amp container from the lemonade dir (this is just a shortcut since lemonade + has the same deps right now as amp) +- Always mount the outermost Git repo under `/app` +- Set the Docker working dir as the current dir +- Each deployable dir specifies all the needed information in `repo_config.py` + (which is the one in the current dir) + - What container to run + - What functionality is supported on different servers (e.g., privileged way) +- The `changelog.txt` file is in the deployable dir (e.g., + optimizer/changelog.txt) +- Each + +One run the invoke commands from optimizer dir + +When the Docker container starts the current dir is optimizer + +helpers, core is mounted in the same dir + +You can't see code outside optimizer + +TODO(gp): running in amp under lemonade should use the local repo_config + +## Release and ECR flow + +TODO(gp): Implement this + +## Unit testing code inside `opt` container + +- Since we want to segregate the package dependencies in different containers, + tests that have a dependency from cvxopt /cvxpy can't be run inside the `amp` + container but need to be run inside `opt`. +- We want to: + 1. (as always) write and run unit tests for the optimizer code in isolation, + i.e., test the code in the directory `optimizer` by itself + 2. Run all the tests for the entire repo (relying on both containers `amp` and + `optimizer` with a single command invocation) + 3. Be able to run tests belonging to only one of the containers to shorten the + debugging cycle +- To achieve this we need to solve the 3 problems below. + +### Avoid compiling code depending from cvxopt when running amp + +- We can't parse code (e.g., in `pytest`) that includes packages that are not + present in a container + - E.g., `pytest` running in `amp` should not parse code in `//amp/optimizer` + since it contains imports that will fail + +- **Solution 1** + - We use the pytest mechanism `cvx = pytest.importorskip("cvxpy")` which is + conceptually equivalent to: + ``` + try: + import cvxopt + has_cvxopt = True + except ImportError: + has_cvxopt = False + + if has_cvxopt: + def utils1(): + cvxopt… + ``` + +- **Solution 2** + - Test in eachfile for the existence of the needed packages and enclose the + code in an `if _has_package` + - Pros: + - We can skip code based dynamically on a `try ... except ImportModule` to + check what packages are present + - Cons: + - Repeat the same piece of `try ... except` in many places + - Solution: we can factor it out in a function + - We need to enclose the code in a `if ...` that screws up the indentation + and makes the code weird + +- **Solution 3** + - Exclude certain directories (e.g., `//amp/optimizer`) from `pytest` + - Pros: + - We don't have to spread the `try ... 
except` and `if \_has_package` in + the code + - Cons: + - The directory is relative to the top directory + - Solution: we can use a regex to specify the dir without the full path + - Which directories are included and excluded depends on where `pytest` is + run + - E.g., running `pytest` in an `amp` container we need to skip the + `optimizer` dir, while `pytest` in an `optimizer` container should + skip everything but the `optimizer` dir + +- **Solution 4** + - Exclude certain directories or files based on which container we are running + in + - Cons: + - We need to have a way to determine in which container we are running + - Solution: we can use the env vars we use for versioning + ``` + > echo $AM_CONTAINER_VERSION + amp-1.0.3- + ``` +- Given the pros and cons, we decided to follow Solution 1 and Solution 3 + +### Run optimizer tests in a stand-alone `opt` container + +- To run the optimizer tests, you can create an `opt ` container and then run + `pytest` + ``` + > cd optimizer + > i opt_docker_bash + docker> pytest . + ``` +- We wrap this in an invoke target like `i opt_run_fast_tests` + +**Alternative solution** + +- We can use dind to run the `opt` container inside a `cmamp` one + - Cons: + - Dind complicates the system + - Dind is not supported everywhere (one needs privileged containers) + - Dind is slower since there are 2 levels of (relatively fast) + virtualization + +### Run optimizer tests as part of running unit tests for `cmamp` + +- We use the same mechanism as `run_fast_slow_superslow_tests` to pull together + different test lists + +## Call a Dockerized executable from a container + +- From + [https://github.com/cryptokaizen/cmamp/issues/1357](https://github.com/cryptokaizen/cmamp/issues/1357) +- We need to call something from `amp` to `opt` Docker + +- **Solution 1** + - Inside the code we build the command line + `cmd = 'docker run -it ... '; system(cmd)` + - Cons: + - There is code replicated between here and the invoke task (e.g., the + info about the container, ...) + +- **Solution 2** + - Call the Dockerized executable using the `docker_cmd` invoke target + ``` + cmd = "invoke opt_docker_cmd -cmd '...'" + system(cmd) + ``` + - Pros: + - All the Docker commands go through the same interface inside invoke + - Cons + - Bash interpolation in the command + - Another level of indirection: do a system call to call `invoke`, + `invoke` calls docker, docker does the work + - `invoke` needs to be installed inside the calling container + +- **Solution 3** + - Call opt_lib_tasks.py `opt_docker_cmd(cmd, ...)` + - Pros + - Avoid doing a call to invoke + - Can deal with bash interpolation in Python + +- We should always use Solution 3, although in the code sometimes we use + Solution 1 and 2 (but we should replace in favor of Solution 3). 
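- A minimal sketch of Solution 3 is below; the module alias, the exact
  signature of `opt_docker_cmd()`, and the command string are placeholders and
  may differ from the actual code
  ```python
  # Sketch only: assumes `opt_docker_cmd()` is importable from
  # `optimizer/opt_lib_tasks.py`; the script name and flags are hypothetical.
  import optimizer.opt_lib_tasks as ooplitas


  def run_in_opt_container(config_path: str) -> None:
      # Build the command line to execute inside the `opt` container.
      cmd = f"python optimizer/run_optimizer_stub.py --config {config_path}"
      # Let the helper handle the Docker plumbing (image, mounts, working dir).
      ooplitas.opt_docker_cmd(cmd)
  ```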
+ +## + +- The interface to the Dockerized optimizer is in `run_optimizer` in + `//amp/oms/call_optimizer.py` +- To run the examples + ``` + > cd //lime + > i docker_bash + > pytest ./amp/oms/test/test_call_optimizer.py::Test_run_dockerized_optimizer1 + ``` diff --git a/docs/work_tools/all.dockerhub.how_to_guide.md b/docs/work_tools/all.dockerhub.how_to_guide.md index def5411629..972b0e0b66 100644 --- a/docs/work_tools/all.dockerhub.how_to_guide.md +++ b/docs/work_tools/all.dockerhub.how_to_guide.md @@ -1,28 +1,42 @@ -# Login Dockerhub + + + + +- [Dockerhub](#dockerhub) + * [Login Dockerhub](#login-dockerhub) + * [Login through CLI](#login-through-cli) + * [List all the images](#list-all-the-images) + * [Rename an image](#rename-an-image) + + + +# Dockerhub + +## Login Dockerhub https://hub.docker.com/ -Username: sorrentum -Email: gp@crypto-kaizen.com +Username: sorrentum Email: gp@crypto-kaizen.com There are several public images -- sorrentum/cmamp -- sorrentum/dev_tools +- Sorrentum/cmamp +- Sorrentum/dev_tools Used in DATA605: -- sorrentum/sorrentum -- sorrentum/defi -- sorrentum/jupyter + +- Sorrentum/sorrentum +- Sorrentum/defi +- Sorrentum/jupyter The page corresponding to the Sorrentum repo is https://hub.docker.com/u/sorrentum -# Login through CLI +## Login through CLI > docker login --username sorrentum --password XYZ -# List all the images +## List all the images - Without authentication ``` @@ -34,10 +48,10 @@ https://hub.docker.com/u/sorrentum "defi" ``` -# Rename an image +## Rename an image -> docker pull yourusername/oldimagename:tag -> docker tag yourusername/oldimagename:tag yourusername/newimagename:tag -> docker push yourusername/newimagename:tag +> docker pull yourusername/oldimagename:tag docker tag +> yourusername/oldimagename:tag yourusername/newimagename:tag docker push +> yourusername/newimagename:tag - To delete the old image you need to go through the GUI diff --git a/docs/work_tools/all.gh_and_thin_env_requirements.reference.md b/docs/work_tools/all.gh_and_thin_env_requirements.reference.md new file mode 100644 index 0000000000..410353be51 --- /dev/null +++ b/docs/work_tools/all.gh_and_thin_env_requirements.reference.md @@ -0,0 +1,79 @@ +# Required Packages for the thin environment and GH Actions + + + +- [Thin environment](#thin-environment) + * [Packages](#packages) + * [Candidate Packages to remove](#candidate-packages-to-remove) +- [GH Actions](#gh-actions) + * [Packages](#packages-1) + * [Candidate Packages to remove](#candidate-packages-to-remove-1) + + + +## Thin environment + +File location: + +- [requirements.txt](https://github.com/cryptokaizen/cmamp/blob/master/dev_scripts/client_setup/requirements.txt) + +### Packages + +- `boto3` + - Interacts with the AWS services: + - [`boto3` import in the `haws`](https://github.com/cryptokaizen/cmamp/blob/master/helpers/haws.py#L10) + - [`haws` usage in the `lib_tasks_docker_release.py`](https://github.com/cryptokaizen/cmamp/blob/master/helpers/lib_tasks_docker_release.py#L862) + +- `invoke` + - Need for running the invoke targets: + - [\_run_tests](https://github.com/cryptokaizen/cmamp/blob/master/helpers/lib_tasks_pytest.py#L299) + +- `poetry` + - Manage dependencies in the dev image: + - [docker_build_local_image](https://github.com/cryptokaizen/cmamp/blob/master/helpers/lib_tasks_docker_release.py#L119) + +- `pytest` + - To run `Docker image QA tests`: + - [\_run_qa_tests](https://github.com/cryptokaizen/cmamp/blob/master/helpers/lib_tasks_docker_release.py#L119) + +- `tqdm` + - Widely used for showing the 
progress of the process for example: + - [\_fix_invalid_owner](https://github.com/cryptokaizen/cmamp/blob/master/helpers/lib_tasks_perms.py#L243) + +- `s3fs` + - Needed for some invoke targets, for example: + - [docker_update_prod_task_definition](https://github.com/cryptokaizen/cmamp/blob/CmampTask6520_gDoc_for_required_packages_in_github_workflow_and_thin_env/helpers/lib_tasks_docker_release.py#L866) + +- `requests` + - Dependency for the `docker`, for now pinned to the version `2.31.0` since + the versions >=`2.32.1` is causing the issue with the `docker-compose`: + https://github.com/psf/requests/issues/6707 + - See the https://github.com/cryptokaizen/cmamp/issues/8340 for details + +### Candidate Packages to remove + +- `docker` and `docker-compose` should be moved to OS installation + https://github.com/cryptokaizen/cmamp/issues/6498 + +## GH Actions + +File location: + +- [gh_requirements.txt](https://github.com/cryptokaizen/cmamp/blob/master/.github/gh_requirements.txt) + +### Packages + +- `invoke` +- `poetry` +- `pytest` +- `tqdm` +- `s3fs` +- `requests` + +For above packages, see descriptions in the +[Thin environment/Packages](#packages) section. + +### Candidate Packages to remove + +- `docker` and `docker-compose` see in the + [Thin environment](#candidate-packages-to-remove) section diff --git a/docs/work_tools/all.gh_thin_env_dependencies.how_to_guide.md b/docs/work_tools/all.gh_thin_env_dependencies.how_to_guide.md new file mode 100644 index 0000000000..9f74a63756 --- /dev/null +++ b/docs/work_tools/all.gh_thin_env_dependencies.how_to_guide.md @@ -0,0 +1,98 @@ +# Thin environment dependencies + + + +- [Description](#description) +- [Change in requirements file](#change-in-requirements-file) +- [Confirm with Build team](#confirm-with-build-team) +- [Update requirements file](#update-requirements-file) +- [Update Documentation](#update-documentation) +- [Notify Team](#notify-team) + + + +## Description + +- We have 3 sources of package requirements in the project: + + 1. The thin environment to run `invoke` targets outside the container + - [/dev_scripts/client_setup/requirements.txt](/dev_scripts/client_setup/requirements.txt) + - This is managed with `pip` + 2. GitHub requirements used for GitHub Actions specifically + - [/.github/gh_requirements.txt](/.github/gh_requirements.txt) + - This is managed with `pip` + 3. 
Requirements necessary for the container: + - [/devops/docker_build/pyproject.toml](/devops/docker_build/pyproject.toml) + - This is managed with `poetry` + +- We want to keep the thin environment as "thin" as possible (i.e., with fewer + dependencies) +- The thin environment and GitHub requirements have to be in sync + - The only difference is that the GitHub requirements have some limitations + due to the GitHub Actions environment + - TODO(Vlad): Still not clear what exact difference between the two + requirements files + +- This document provides a step-by-step guide for adding or make any changes in + the requirements file of both the thin env and GitHub + +## Change in requirements file + +- Some reasons for updating/changing the `requirements.txt` file are: + - A new feature requires a new package outside the container, e.g., a new or + updated `invoke` target + - Upgrading the package version since the current one is outdated + - Removing a package since it is not used anymore + +## Confirm with Build team + +- Changes in any of the requirement files should be confirmed with the Build + team before merging the PR + - Is the new dependencies really needed? + - If the new dependencies is really needed, can we limit the scope of the + dependency? E.g., + - Move the related imports to where it is strictly needed in the code + - Do a try-catch `ImportError` + +Example: + +- The [/helpers/lib_tasks_gh.py](/helpers/lib_tasks_gh.py) module has some + `invoke` targets that are executed only in the container +- If the new package is needed for the `invoke` target only in the container, we + should move the import to the function where it is strictly needed +- See the `gh_publish_buildmeister_dashboard_to_s3()` in the + [/helpers/lib_tasks_gh.py](https://github.com/cryptokaizen/cmamp/blob/master/helpers/lib_tasks_gh.py#L469) + for reference. + +## Update requirements file + +- Update both the requirements file if relevant + [/dev_scripts/client_setup/requirements.txt](/dev_scripts/client_setup/requirements.txt) + and [/.github/gh_requirements.txt](/.github/gh_requirements.txt) + - This file should be changed in every repository (e.g., `cmamp`, + `kaizenflow`, `orange`) +- After adding the new requirements the build team will run all the tests + locally as well as on GitHub + +## Update Documentation + +- Update the + [/docs/dev_tools/thin_env/all.gh_and_thin_env_requirements.reference.md](/docs/dev_tools/thin_env/all.gh_and_thin_env_requirements.reference.md) + +## Notify Team + +In the @all Telegram channel, notify the team about the new package and ask them +to rebuild the thin env. + +Example: +``` +Hi! In the PR: https://github.com/cryptokaizen/cmamp/pull/6800 we removed +unused packages from the thin environment. 
+ +You need to update the thin environment by running: + +> cd ~/src/cmamp1 +> dev_scripts/client_setup/build.sh +``` + +Last review: GP on 2024-05-07 diff --git a/docs/work_tools/all.git.how_to_guide.md b/docs/work_tools/all.git.how_to_guide.md index 1a3e349b7d..2280e6ea15 100644 --- a/docs/work_tools/all.git.how_to_guide.md +++ b/docs/work_tools/all.git.how_to_guide.md @@ -1,4 +1,6 @@ -# Git workflow and best practices +# Git + +## Git workflow and best practices @@ -44,7 +46,7 @@ -# Before you start +## Before you start - GitHub is the place where we keep our code - `git` is the tool (program) for version control @@ -55,7 +57,7 @@ - More details about what is public key you can find in [all.ssh.how_to_guide.md](https://github.com/cryptokaizen/cmamp/blob/master/docs/work_tools/all.ssh.how_to_guide.md) -## Readings +### Readings - Read at least the first 3 chapters of [Git book](https://git-scm.com/book/en/v2) @@ -63,7 +65,7 @@ [Git Submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) - We use Git submodules to compose and share code about repos -# Workflow +## Workflow - Run `git fetch` ``` @@ -196,9 +198,9 @@ - Follow up on all comments and mark as resolved any requested changes that you resolve -# Best Practices +## Best Practices -## Do not check in large data files +### Do not check in large data files - Avoid checking in large data files - The reason is that large files bloat the repo @@ -208,13 +210,13 @@ - Sometimes is makes sense to check in some representative data for unit tests - BUT, larger tests should obtain their data from s3 or MongoDB -## Branch workflow best practices +### Branch workflow best practices -### Branches are cheap +#### Branches are cheap - One of the advantages of working with Git is that branches are cheap -### `master` is sacred +#### `master` is sacred - In an ideal world `master` branch is sacred (see Platinum rule of Git) - Development should never be done directly on master @@ -224,7 +226,7 @@ - `master` should be always never broken (all tests are passing and it is deployable) -### Always work in a branch +#### Always work in a branch - Generally it is best to be the sole contributor to your branch - If you need to collaborate with somebody on a branch, remember that the @@ -247,7 +249,7 @@ changes outside of the notebook (e.g., hacks to get the notebook working that need to be cleaned up) -### Keep different changes in separate branches +#### Keep different changes in separate branches - It is easier for you to keep work sane and separated - Cons of multiple conceptual changes in the same branches @@ -257,7 +259,7 @@ - Packaging unrelated changes together that means no change gets merged until all of the changes are accepted -## Pull request (PR) best practices +### Pull request (PR) best practices - Make sure your PR is coherent - It may not need to do everything the Task requires, but the PR should be @@ -278,9 +280,9 @@ don't need to write any code, just do <this_and_that>" - Merged changes are tested in the Jenkins build -## Workflow diagram +### Workflow diagram -## Deleting a branch +### Deleting a branch - You can run the script `dev_scripts/git/git_branch.sh` to get all the branches together with some information, e.g., last commit and creator @@ -332,9 +334,9 @@ > git push origin --delete PTask354_INFRA_Populate_S3_bucket ``` -# How-to and troubleshooting +## How-to and troubleshooting -## Do not mess up your branch +### Do not mess up your branch - If you are working in a branch, before doing `git push` make sure the branch is 
not broken (e.g., from a mistake in merge / rebase mess) @@ -361,9 +363,9 @@ - If you see that there is a problem, don't push upstream (because the branch will be broken for everybody) and ask a Git expert -## Analyzing commits +### Analyzing commits -### Show files modified in a commit +#### Show files modified in a commit - You can see the files modified in a given commit hash with: ``` @@ -383,9 +385,9 @@ vendors/first_rate/utils.py ``` -## Conflicts +### Conflicts -### Getting the conflicting files +#### Getting the conflicting files - To see the files in conflicts ``` @@ -393,7 +395,7 @@ ``` - This is what the script `git_conflict_files.sh` does -### Accepting "theirs" +#### Accepting "theirs" ``` > git checkout --theirs $FILES > git add $FILES @@ -410,7 +412,7 @@ - Stage #1 is the common ancestor of the files, stage #2 is the target-branch version, and stage #3 is the version you are merging from. -## How to get out of a messy/un-mergeable branch +### How to get out of a messy/un-mergeable branch - If one screws up a branch: - Rebase to master @@ -446,16 +448,16 @@ ... ``` -## Reverting +### Reverting -### Reverting the last local commit +#### Reverting the last local commit ``` > git reset --soft HEAD~ ``` -## Branching +### Branching -### Checking what work has been done in a branch +#### Checking what work has been done in a branch - Look at all the branches available: ``` @@ -489,7 +491,7 @@ > gd a637594..eb12233 ``` -### Checking if you need to merge `master` into your feature branch +#### Checking if you need to merge `master` into your feature branch - You can see what commits are in master but missing in your branch with: ``` @@ -499,7 +501,7 @@ ``` - You want to `rebase` your feature branch onto `master` -### Comparing the difference of a directory among branches +#### Comparing the difference of a directory among branches - This is useful if we want to focus on changes on a single dir ``` @@ -521,7 +523,7 @@ vendors/test/test_vendors.py ``` -## Merging `master` +### Merging `master` - If your branch lives long, you want to apply changes made on master to show on your branch @@ -549,7 +551,7 @@ change in one shot like we would do for a merge commit, but you need to revert all the inlined changes -## Rebasing +### Rebasing - **For now, we suggest avoiding the rebase flow** - The reason is that rebase makes things cleaner when used properly, but can get @@ -569,7 +571,7 @@ # You can see that you are ahead of master ``` -## Merging pull requests +### Merging pull requests - The procedure for manual merges is as follows - **Do not merge yourself unless explicitly requested by a reviewer** @@ -591,14 +593,14 @@ > git branch -d my_feature ``` -# Submodules +## Submodules -## Adding a submodule +### Adding a submodule - Following the instructions in [https://git-scm.com/book/en/v2/Git-Tools-Submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) -## Working in a submodule +### Working in a submodule - When you work in a submodule, the flow should be like: - Create a branch in a submodule @@ -606,7 +608,7 @@ - Push the submodule branch - Create a PR in the submodule when you are done -## Updating a submodule to the latest commit +### Updating a submodule to the latest commit - After the submodule PR is merged: - Checkout the submodule in the master branch and do `git pull` @@ -615,26 +617,26 @@ - Commit changes, push - Create a PR -## To check if supermodule and amp are in sync +### To check if supermodule and amp are in sync - Run the script: ``` > 
dev_scripts/git/git_submodules_are_updated.sh ``` -## Roll forward git submodules pointers: +### Roll forward git submodules pointers: - Run the script: ``` > dev_scripts/git/git_submodules_roll_fwd.sh ``` -## To clean all the repos +### To clean all the repos ``` > git submodule foreach git clean -fd ``` -## Pull a branch without checkout +### Pull a branch without checkout - This is useful when merging `master` in a different branch and we don't want to checkout master just to pull @@ -642,7 +644,7 @@ > git fetch origin master:master ``` -## To force updating all the submodules +### To force updating all the submodules - Run the script `> dev_scripts/git/git_submodules_pull.sh` or ``` diff --git a/docs/work_tools/all.invoke_workflows.how_to_guide.md b/docs/work_tools/all.invoke_workflows.how_to_guide.md index c261a99348..5f00712465 100644 --- a/docs/work_tools/all.invoke_workflows.how_to_guide.md +++ b/docs/work_tools/all.invoke_workflows.how_to_guide.md @@ -1,4 +1,4 @@ - +# Invoke Workflows @@ -15,8 +15,8 @@ + [Using git](#using-git) * [Systematic code transformation](#systematic-code-transformation) * [Generate a local `amp` Docker image](#generate-a-local-amp-docker-image) - * [Update the dev `amp` Docker image](#update-the-dev-amp-docker-image) - * [Experiment in a local image](#experiment-in-a-local-image) +- [Update the dev `amp` Docker image](#update-the-dev-amp-docker-image) +- [Experiment in a local image](#experiment-in-a-local-image) - [GitHub Actions (CI)](#github-actions-ci) - [pytest](#pytest) * [Run with coverage](#run-with-coverage) @@ -30,7 +30,7 @@ -# Introduction +## Introduction - We use `invoke` to implement workflows (aka "tasks") similar to Makefile targets, but using Python @@ -45,7 +45,7 @@ - Branch integration: `integrate_*` - Releasing tools and Docker images: `docker_*` - Lint: `lint_*` - - Pytest: + - Each set of commands starts with the name of the corresponding topic: - E.g., `docker_*` for all the tasks related to Docker - The best approach to getting familiar with the tasks is to browse the list and @@ -81,7 +81,7 @@ - Tabbing after typing a dash (-) or double dash (--) will display valid options/flags for the current context. -## Listing all the tasks +### Listing all the tasks - New commands are always being added, but a list of valid tasks is below @@ -172,7 +172,7 @@ traceback Parse the traceback from Pytest and navigate it with vim. ``` -## Getting help for a specific workflow +### Getting help for a specific workflow - You can get a more detailed help with @@ -201,7 +201,7 @@ -y STRING, --pytest-mark=STRING ``` -## Implementation details +### Implementation details - By convention all invoke targets are in `*_lib_tasks.py`, e.g., - `helpers/lib_tasks.py` - tasks to be run in `cmamp` @@ -229,15 +229,15 @@ - In other words one should do `cd cmamp/optimizer` before doing `i invoke_task2 ...` -# Git +## Git -## Merge master in the current branch +### Merge master in the current branch ```bash > i git_merge_master ``` -# GitHub +## GitHub - Get the official branch name corresponding to an Issue @@ -250,16 +250,16 @@ https://github.com/alphamatic/amp/pull/256 ``` -## Create a PR +### Create a PR TODO(gp): Describe -## Extract a PR from a larger one +### Extract a PR from a larger one - When having a PR which is really big we prefer to brake it into smaller mergeable PRs using `i git_branch_copy` -### Example +#### Example - In my workflow there is a feature branch (e.g. `CmTask5874_Document_PR_flow` that I am developing in. 
@@ -411,7 +411,7 @@ TODO(gp): Describe all the code is merged. ``` -### Using git +#### Using git ```bash > git checkout `dst_branch` @@ -419,11 +419,11 @@ TODO(gp): Describe > git reset HEAD ``` -## Systematic code transformation +### Systematic code transformation - See the help of `amp/dev_scripts/replace_text.py` -## Generate a local `amp` Docker image +### Generate a local `amp` Docker image - This is a manual flow used to test and debug images before releasing them to the team. @@ -508,10 +508,10 @@ TODO(gp): Describe - If you are running inside a notebook using `i docker_jupyter` you can install packages using a one liner `! sudo su -; source ...; ` -# GitHub Actions (CI) +## GitHub Actions (CI) ```bash -## Running a single test in GH Actions +### Running a single test in GH Actions Create a branch @@ -520,30 +520,30 @@ Change .github/workflows/fast_tests.yml run: invoke run_fast_tests --pytest-opts="helpers/test/test_git.py::Test_git_modified_files1::test_get_modified_files_in_branch1 -s --dbg" -# In the current implementation (where we try to not run for branches) to run in a branch +## In the current implementation (where we try to not run for branches) to run in a branch ``` -# pytest +## pytest - From https://gist.github.com/kwmiebach/3fd49612ef7a52b5ce3a - More details on running unit tests with `invoke` is [/docs/coding/all.run_unit_tests.how_to_guide.md](/docs/coding/all.run_unit_tests.how_to_guide.md) -## Run with coverage +### Run with coverage ```bash > i run_fast_tests --pytest-opts="core/test/test_finance.py" --coverage ``` -## Capture output of a pytest +### Capture output of a pytest - Inside the `dev` container (i.e., docker bash) ```bash docker> pytest_log ... ``` -## Run only one test based on its name +### Run only one test based on its name - Outside the `dev` container @@ -557,7 +557,7 @@ run: invoke run_fast_tests ./helpers/test/test_hobject.py::Test_obj_to_str1 ``` -## Iterate on stacktrace of failing test +### Iterate on stacktrace of failing test - Inside docker bash ```bash @@ -590,7 +590,7 @@ run: invoke run_fast_tests - The short form is `it` -## Iterating on a failing regression test +### Iterating on a failing regression test - The workflow is: @@ -603,7 +603,7 @@ run: invoke run_fast_tests > invoke pytest_repro ``` -## Detect mismatches with golden test outcomes +### Detect mismatches with golden test outcomes - The command is @@ -631,9 +631,9 @@ run: invoke run_fast_tests - For more details see [CmTask528](https://github.com/cryptokaizen/cmamp/issues/528). 
-# Lint +## Lint -## Lint everything +### Lint everything ```bash > i lint --phases="amp_isort amp_class_method_order amp_normalize_import diff --git a/docs/work_tools/all.latex_toolchain.how_to_guide.md b/docs/work_tools/all.latex_toolchain.how_to_guide.md index 0a46cc94fe..282804db57 100644 --- a/docs/work_tools/all.latex_toolchain.how_to_guide.md +++ b/docs/work_tools/all.latex_toolchain.how_to_guide.md @@ -1,12 +1,26 @@ -# Running and linting Latex files + + + + +- [Latex Toolchain](#latex-toolchain) + * [Running and linting Latex files](#running-and-linting-latex-files) + * [Embedding Mermaid and PlanUML figures](#embedding-mermaid-and-planuml-figures) + * [Finding citations](#finding-citations) + * [TODOs](#todos) + + + +# Latex Toolchain + +## Running and linting Latex files We organize each project is in a directory (e.g., under `//papers`) Under each dir there are two scripts: + - `run_latex.sh` -- `lint_latex.sh` -that assign some variables and then call the main scripts to perform the actual -work: +- `lint_latex.sh` that assign some variables and then call the main scripts to + perform the actual work: - `dev_scripts/latex/run_latex.sh` - `dev_scripts/latex/lint_latex.sh` @@ -30,7 +44,7 @@ To lint the Latex file: papers/DataFlow_stream_computing_framework/DataFlow_stream_computing_framework.tex 320ms (unchanged) ``` -# Embedding Mermaid and PlanUML figures +## Embedding Mermaid and PlanUML figures Update ./dev_scripts/documentation/render_md.py @@ -38,20 +52,17 @@ Update ./dev_scripts/documentation/render_md.py - It works on both Markdown and Latex files - Find a mermaid/plantuml block and then add an image -%```mermaid -%flowchart -% Vendor Data --> VendorDataReader --> DataReader --> User -%``` +%`mermaid %flowchart % Vendor Data --> VendorDataReader --> DataReader --> User %` -# Finding citations +## Finding citations -The simplest way is to use Google Scholar and then use the "Cite" option to get a -Bibtex entry +The simplest way is to use Google Scholar and then use the "Cite" option to get +a Bibtex entry Some interesting links are https://tex.stackexchange.com/questions/143/what-are-good-sites-to-find-citations-in-bibtex-format -# TODOs +## TODOs - Add a script to decorate the file with separators as part of the linting ``` @@ -68,4 +79,3 @@ https://tex.stackexchange.com/questions/143/what-are-good-sites-to-find-citation - Add a script to run a ChatGPT prompt on a certain chunk of text - Easily create a vimfile to navigate the TOC - diff --git a/docs/work_tools/all.pycharm.how_to_guide.md b/docs/work_tools/all.pycharm.how_to_guide.md index 3451ff56d1..5734e7b6d7 100644 --- a/docs/work_tools/all.pycharm.how_to_guide.md +++ b/docs/work_tools/all.pycharm.how_to_guide.md @@ -1,8 +1,11 @@ -# PyCharm +# Pycharm + +## PyCharm - [Current situation](#current-situation) +- [Current situation](#current-situation-1) - [How to run our cmamp container directly from PyCharm](#how-to-run-our-cmamp-container-directly-from-pycharm) - [How to review a PR inside Pycharm](#how-to-review-a-pr-inside-pycharm) - [How to edit remote code](#how-to-edit-remote-code) @@ -12,21 +15,22 @@ - [PUDB - remote debugging - ToDo](#pudb---remote-debugging---todo) - [How to run tests inside a container](#how-to-run-tests-inside-a-container) - [Installing PyCharm Professional](#installing-pycharm-professional) - - [Windows](#windows) - - [macOS](#macos) - - [Linux](#linux) + * [Windows](#windows) + * [macOS](#macos) + * [Linux](#linux) - [Connecting via PyCharm gateway 
(SSH)](#connecting-via-pycharm-gateway-ssh) - - [Connecting via VNC](#connecting-via-vnc) + * [Connecting via VNC](#connecting-via-vnc) - [Configuration](#configuration) - - [Reflow](#reflow) + * [Reflow](#reflow) - [Some recommended plug-ins](#some-recommended-plug-ins) -# Current situation + +## Current situation + There are multiple ways to develop on a remote server using PyCharm 1. VNC approach - - PyCharm runs locally on the server using a "virtual screen" - Your laptop interacts with a VNC server to get the GUI locally - Pros: @@ -37,7 +41,6 @@ There are multiple ways to develop on a remote server using PyCharm - Without enough bandwidth it's slow and not snappy enough 2. X11 approach - - Same as VNC, but instead of sending bitmaps through VNC, a "compressed" version of the GUI is sent to the local computer directly - Pros: @@ -49,7 +52,6 @@ There are multiple ways to develop on a remote server using PyCharm - One needs to tunnel X11 traffic, set things up, and so on 3. PyCharm Gateway - - New client-server architecture for PyCharm - A "headless" PyCharm runs on the server - A GUI client PyCharm runs on your laptop @@ -68,7 +70,7 @@ There are multiple ways to develop on a remote server using PyCharm - Cons - You can't run / debug remotely -# Current situation +## Current situation - Approach 1) seems to require lots of memory and CPU and it's not really fast. @@ -79,7 +81,7 @@ There are multiple ways to develop on a remote server using PyCharm - TODO(gp): @Juraj understand if it works, if it's fast, and if it requires less memory -# How to run our cmamp container directly from PyCharm +## How to run our cmamp container directly from PyCharm - PyCharm allows to run commands directly inside a container - See @@ -89,13 +91,13 @@ There are multiple ways to develop on a remote server using PyCharm - TODO(gp): @Juraj Let's both try this. There are some notes below about it -# How to review a PR inside Pycharm +## How to review a PR inside Pycharm - CTRL + SHIFT + A -> View Pull Request - -# How to edit remote code +## How to edit remote code - You need to use a certain local directory (e.g., /Users/saggese/src/commodity_research1) and a remote directory (e.g., @@ -129,23 +131,18 @@ pycharm - Run application remotely inside Docker -# General ssh config +## General ssh config - File | Settings | Tools | SSH Configurations - - - Once setup, ssh config can be used for all tools in PyCharm. - - Remote Interpreter - - DataGrip - - Deployment - - Etc. -# DB connection via ssh +## DB connection via ssh Note: PyCharm Professional DataGrip is used as an example. There are numerous open source alternatives such as [Beaver](https://dbeaver.io/). Config below @@ -153,24 +150,20 @@ should apply to them also. - To add a new data source in DataGrip, go to the database section in the lower left corner. - - - Then pick your desired data source from the dropdown in the upper right corner. - - - You will be presented with a dummy config that needs to be replaced with proper data as shown below. - - - -- Before that is done, be sure that proper ssh info is added in SSH/SSL section. +- Before that is done, be sure that proper ssh info is added in SSH/SSL section. - -# Deployment with remote repository (through sync) +## Deployment with remote repository (through sync) Note: Before setting up deployment, pull the cmamp repo on EC2 instance and use the same name as on your local machine (example: cmamp1). Always try to keep @@ -178,26 +171,22 @@ both repos in sync via git. 
For more subtle and simpler changes use File | Reload All From Disk . This will upload changes to the remote repo. - Tools | Deployment | Configuration - - - - - + - Tools | Deployment | Options - - - Uncheck "Skip external changes" and check "Delete remote files" - Tools | Deployment | Automatic Upload - - Check it - Tools | Deployment | Browse Remote Host - -# PUDB - remote debugging - ToDo +## PUDB - remote debugging - ToDo -# How to run tests inside a container +## How to run tests inside a container - [https://www.jetbrains.com/help/pycharm/using-docker-compose-as-a-remote-interpreter.html#docker-compose-remote](https://www.jetbrains.com/help/pycharm/using-docker-compose-as-a-remote-interpreter.html#docker-compose-remote) @@ -205,9 +194,9 @@ Reload All From Disk . This will upload changes to the remote repo. Professional Edition, while the terminal itself is available in both Professional and Community editions. -# Installing PyCharm Professional +## Installing PyCharm Professional -## Windows +### Windows 1. Download the installer using this [link](https://www.jetbrains.com/pycharm/download/#section=windows) @@ -215,7 +204,7 @@ Reload All From Disk . This will upload changes to the remote repo. 3. To run PyCharm, find it in the Windows Start menu or use the desktop shortcut. -## macOS +### macOS There are separate disk images for Intel and Apple Silicon processors. @@ -226,7 +215,7 @@ There are separate disk images for Intel and Apple Silicon processors. 3. Run the PyCharm app from the Applications directory, Launchpad, or Spotlight. -## Linux +### Linux **Using tar archives ** @@ -235,7 +224,6 @@ There are separate disk images for Intel and Apple Silicon processors. 2. Unpack the pycharm-\*.tar.gz file to a different folder, if your current Download folder doesn't support file execution: - ``` > tar xzf pycharm-*.tar.gz -C `` @@ -244,13 +232,11 @@ There are separate disk images for Intel and Apple Silicon processors. The recommended installation location according to the filesystem hierarchy standard (FHS) is `/opt`. To install PyCharm into this directory, enter the following command: - ``` > sudo tar xzf pycharm-\*.tar.gz -C /opt/ ``` 3. Switch to the **bin** subdirectory: - ``` > cd /pycharm-\*/bin # E.g., @@ -265,7 +251,6 @@ following command: **Using snap packages** 1. For Ubuntu 16.04 and later, you can use snap packages to install PyCharm. - ``` > sudo snap install pycharm-professional --classic # or @@ -281,7 +266,7 @@ following command: > pycharm-educational ``` -# Connecting via PyCharm gateway (SSH) +## Connecting via PyCharm gateway (SSH) The first thing you need to do is sign up for a free [trial license](https://www.jetbrains.com/ru-ru/remote-development/gateway/) or @@ -300,7 +285,7 @@ Then make sure you have a VPN connection to our VPC 8. Locate your directory. Example: /data/richard 9. Click on Download and Start IDE. -## Connecting via VNC +### Connecting via VNC - Make sure you have a VPN connection. @@ -312,8 +297,8 @@ Then make sure you have a VPN connection to our VPC Sysadmin has sent you: - `os_password.txt` -- your username `$USER` -- a key `crypto.pub` that looks like: +- Your username `$USER` +- A key `crypto.pub` that looks like: ``` -----BEGIN OPENSSH PRIVATE KEY----- b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABlwAAAAdzc2gtcn @@ -326,21 +311,20 @@ Let's say you are connected via VNC. 1. Login into the OS. 2. 
Run pycharm.sh using terminal (should be there) - ``` > bash /opt/pycharm-community-2021.2.3/bin/pycharm.sh ``` -# Configuration +## Configuration -## Reflow +### Reflow - Set the reflow to reindent - -# Some recommended plug-ins +## Some recommended plug-ins -- vim +- Vim - Grazie - [Wrap-to-column](https://plugins.jetbrains.com/plugin/7234-wrap-to-column) - GitHub Copilot diff --git a/docs/work_tools/all.python_package_upgrade_and_troubleshooting.how_to_guide.md b/docs/work_tools/all.python_package_upgrade_and_troubleshooting.how_to_guide.md index 47fdb40327..0d69b9a01e 100644 --- a/docs/work_tools/all.python_package_upgrade_and_troubleshooting.how_to_guide.md +++ b/docs/work_tools/all.python_package_upgrade_and_troubleshooting.how_to_guide.md @@ -1,4 +1,4 @@ - +# Python package upgrade & troubleshooting diff --git a/docs/work_tools/all.ssh.how_to_guide.md b/docs/work_tools/all.ssh.how_to_guide.md index fd7f16ccc5..0a078e9d13 100644 --- a/docs/work_tools/all.ssh.how_to_guide.md +++ b/docs/work_tools/all.ssh.how_to_guide.md @@ -1,4 +1,4 @@ - +# Ssh @@ -8,7 +8,7 @@ -# What is SSH? +## What is SSH? From the Wikipedia @@ -19,14 +19,14 @@ From the Wikipedia More details [here](https://en.wikipedia.org/wiki/Secure_Shell) -# How we use ssh in our company? +## How we use ssh in our company? - We use it to connect to any of our servers. - Sometimes we use `scp` to copy files between hosts via `ssh`. - Don't know what is `scp`? read [here](https://haydenjames.io/linux-securely-copy-files-using-scp/) -# Public key for authorization? +## Public key for authorization? - We use `public key` authorization. This is the common way of secure authorization for SSH connection. diff --git a/docs/work_tools/all.visual_studio_code.how_to_guide.md b/docs/work_tools/all.visual_studio_code.how_to_guide.md index 9a175eb309..b00c529d9b 100644 --- a/docs/work_tools/all.visual_studio_code.how_to_guide.md +++ b/docs/work_tools/all.visual_studio_code.how_to_guide.md @@ -1,5 +1,7 @@ # Visual Studio Code +## Visual Studio Code + - [Connecting via VNC](#connecting-via-vnc) @@ -14,20 +16,18 @@ -# Connecting via VNC +## Connecting via VNC - Make sure you have a VPN connection. -## Installing VNC +### Installing VNC - Install VNC using this link: [https://www.realvnc.com/en/connect/download/viewer/windows/](https://www.realvnc.com/en/connect/download/viewer/windows/) - Sysadmin has sent you: - - `os_password.txt` - - your username `$USER` - - a key `crypto.pub` that looks like: - + - Your username `$USER` + - A key `crypto.pub` that looks like: ``` -----BEGIN OPENSSH PRIVATE KEY----- b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAABlwAAAAdzc2gtcn @@ -38,17 +38,15 @@ ``` - Let's say you are connected via VNC. - - Login into the OS. - Run `pycharm.sh` using terminal (should be there): - ``` > bash /opt/pycharm-community-2021.2.3/bin/pycharm.sh ``` -# Installation of VS Code +## Installation of VS Code -## Windows, Linux, Mac +### Windows, Linux, Mac - Download the installer using this link: [Download Visual Studio Code - Mac, Linux, Windows](https://code.visualstudio.com/download). @@ -80,33 +78,27 @@ - Choose the desired repo directory from the drop-down menu, e.g., `cmamp1`: -# How to run a VSCode debugger within a remote container +## How to run a VSCode debugger within a remote container The goal is to successfully run a Visual Studio Code (VSCode) debugger on code that runs within a docker container located on a remote server. 
-## Prerequisites +### Prerequisites - Mac OS or Linux-based OS - Visual Studio Code installed on the local machine you are working from - VSCode extensions installed: - - [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python) (Installed on the remote machine) - - VS Code installs some files inside `.vscode-*` directories on the remote host to ensure full functionality of the extensions - - [Remote SSH](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-ssh) - - [Remote Explorer](https://marketplace.visualstudio.com/items?itemName=ms-vscode.remote-explorer) - - [Remote Development](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.vscode-remote-extensionpack) (Installed on the remote machine) - - [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) - Ability to access the remote server where the container will be located using @@ -118,11 +110,11 @@ that runs within a docker container located on a remote server. - A running Kaizen dev docker container created on the remote machine using `invoke docker_bash` -## Tips +### Tips - To open the command palette in VSCode use a keyboard shortcut Cmd + Shift + P -## Steps +### Steps 1. Open the command palette and search for `Remote-SSH: Connect to host` action @@ -153,7 +145,6 @@ that runs within a docker container located on a remote server. 8. Paste the following JSON into the file and save it. - ``` { // Use IntelliSense to learn about possible attributes. @@ -190,7 +181,6 @@ that runs within a docker container located on a remote server. `"args"` key to the launch.json file (the location should be `.vscode/launch.json`). The value is a list of command line arguments and values. Example below: - ``` "args": [ "--file", @@ -200,12 +190,12 @@ that runs within a docker container located on a remote server. ] ``` -# How to access the Jupyter server running on the remote server through your local machine +## How to access the Jupyter server running on the remote server through your local machine 1.`i docker_jupyter` ```sh -##> devops/docker_run/run_jupyter_server.sh +###> devops/docker_run/run_jupyter_server.sh > cmd=jupyter notebook --ip=* --port=10421 --allow-root --NotebookApp.token='' ... ``` diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000000..2d539dc0d1 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,20 @@ +site_name: KaizenAI Documentation +theme: + name: material + palette: + primary: black + accent: cyan + logo: mkdocs/assets/logo.png + favicon: mkdocs/assets/favicon.ico +# Helps render images and other assets correctly. +use_directory_urls: false +# The search plugin is enabled by default +# but if we specify plugins, we need to include it +# explicitly. +plugins: + - search + # Used for filtering the explicit table of contents + # because MkDocs generates its own one + - mkdocs-toc-tag-filter +extra_css: + - mkdocs/styles/styles.css \ No newline at end of file diff --git a/mkdocs/mkdocs-toc-tag-filter/__init__.py b/mkdocs/mkdocs-toc-tag-filter/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mkdocs/mkdocs-toc-tag-filter/plugin.py b/mkdocs/mkdocs-toc-tag-filter/plugin.py new file mode 100644 index 0000000000..ca321fee8b --- /dev/null +++ b/mkdocs/mkdocs-toc-tag-filter/plugin.py @@ -0,0 +1,32 @@ +""" +Simple plugin to filter out the table of contents from the markdown content. 
+ +- The MkDocs generates its own table of contents based on the markdown +headers we want to avoid the redundancy when rendering the markdown content +on the website. + +Import as: + +import mkdocs.mkdocs-toc-tag-filter.plugin as mmkdplug +""" + +import re + +from mkdocs.config import config_options +from mkdocs.plugins import BasePlugin + +_PATTERN_TO_FILTER = r"[\s\S]*?" + + +class TocFilterPlugin(BasePlugin): + config_scheme = (("param", config_options.Type(str, default="")),) + + def __init__(self): + self.enabled = True + self.total_time = 0 + + def on_page_markdown(self, markdown, page, config, files): + filtered_markdown = re.sub( + _PATTERN_TO_FILTER, "", markdown, flags=re.DOTALL + ) + return filtered_markdown diff --git a/mkdocs/mkdocs-toc-tag-filter/setup.py b/mkdocs/mkdocs-toc-tag-filter/setup.py new file mode 100644 index 0000000000..3b1db2e4bf --- /dev/null +++ b/mkdocs/mkdocs-toc-tag-filter/setup.py @@ -0,0 +1,20 @@ +from setuptools import find_packages, setup + +setup( + name="mkdocs-toc-tag-filter", + version="0.1.0", + description="A MkDocs plugin to filter tag content", + long_description="", + keywords="mkdocs", + url="", + author="", + author_email="", + license="MIT", + python_requires=">=3.8", + install_requires=["mkdocs>=1.0.4"], + classifiers=["Programming Language :: Python :: 3.8"], + packages=find_packages(), + entry_points={ + "mkdocs.plugins": ["mkdocs-toc-tag-filter = plugin:TocFilterPlugin"] + }, +)