feat: Set network dynamically + docker improvements (#9)
* Improves documentation
* Allows configuration of different URLs for querying the OFF API
* Switches to the default network, which was validated in testing
simonj2 authored Aug 26, 2022
1 parent 6d994a0 · commit 7b9df96
Showing 8 changed files with 73 additions and 40 deletions.
.env (4 additions, 2 deletions)

@@ -8,9 +8,11 @@ CLUSTER_NAME=docker-cluster
 LICENSE=basic
 
 # Port to expose Elasticsearch HTTP API to the host
-ES_PORT=9200
-#ES_PORT=127.0.0.1:9200
+ES_PORT=127.0.0.1:9200
 
 # Increase or decrease based on the available host memory (in bytes)
 # 1GB works well, 2GB and above leads to lower latency
 MEM_LIMIT=4294967296
+
+# On dev, connect to the same network as off-server
+COMMON_NET_NAME=po_default
README.md (20 additions, 6 deletions)

@@ -3,7 +3,7 @@ Open Food Facts Search API V3 using ElasticSearch - https://wiki.openfoodfacts.o
 
 This API is currently in development. It is not serving any production traffic. The [Work Plan](https://wiki.openfoodfacts.org/Search_API_V3#Work_Plan) will be updated as development continues.
 
-The file product.schema.json contains the schema of the returned products.
+The file product.schema.json contains a partial schema of the returned products.
 
 ### Organization
 The main file is `api.py`, and the Product schema is in `models/product.py`.
@@ -20,8 +20,9 @@ Docker spins up:
 - Two elasticsearch nodes
 - [Elasticvue](https://elasticvue.com/)
 - The search service on port 8000
+- Redis on port 6379
 
-You will then need to import from CSV (see instructions below).
+You will then need to import from MongoDB (see instructions below).
 
 ### Development
 For development, you have two options for running the service:
@@ -30,8 +31,7 @@ For development, you have two options for running the service:
 
 To develop on Docker, make the changes you need, then build the image and bring the stack up by running:
 ```console
-docker build -t off_search_image .
-docker-compose up -d
+docker build -t off_search_image . && docker-compose up -d
 ```
 
 However, this tends to be slower than developing locally.
@@ -51,7 +51,21 @@ This repo uses [pre-commit](https://pre-commit.com/) to enforce code styling, etc.
 pre-commit run
 ```
 
-### Helpful commands:
+### Running the import:
+To import data from the [MongoDB export](https://world.openfoodfacts.org/data):
+1. First ensure that your Docker environment has at least 150GB of disk and 6GB of RAM (in Docker Desktop, see Settings --> Resources).
+2. Run the following command:
 ```console
-python scripts/perform_import_parallel.py --filename=/path/to/file.csv
+python scripts/perform_import_parallel.py --filename=/path/to/file.csv --num_processes=2
 ```
+If you get errors, try adding more RAM (12GB works well if you have it to spare), or slow down the indexing process by setting `num_processes` to 1 in the command above.
+
+Typical import time is 1-1.5 hours on an M1 MacBook.
+
+### Testing via CLI:
+Under `scripts/` there are scripts that allow you to send requests to the service, Elasticsearch, or Redis.
+
+For example, to run the autocomplete query on the local docker instance, do:
+```console
+python scripts/http_autocomplete_query.py --port=8000
+```
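
The scripts above are thin wrappers over plain HTTP POSTs, so the service can also be queried directly. Below is a minimal sketch using the same payload shape as `scripts/http_search_query.py`, assuming the service is listening on port 8000; the barcode is a hypothetical example value:

```python
from __future__ import annotations

import requests

# Find a product by barcode on the local instance (hypothetical example code).
payload = {
    'string_filters': [
        {'field': 'code', 'value': '3017620422003', 'operator': 'eq'},
    ],
    'num_results': 10,
}
response = requests.post('http://127.0.0.1:8000/search', json=payload)
print(response.json())
```
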
app/import_queue/product_client.py (9 additions, 1 deletion)

@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import os
+
 import requests

@@ -8,7 +10,13 @@ def __init__(self):
         pass
 
     def get_product(self, code):
-        url = 'https://world.openfoodfacts.org/api/v2/product/{}'.format(code)
+        url = '{}/api/v2/product/{}'.format(
+            os.getenv(
+                'OPENFOODFACTS_API_URL',
+                'https://world.openfoodfacts.org',
+            ),
+            code,
+        )
         response = requests.get(url)
         json_response = response.json()
         if not json_response or not json_response.get('product'):
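
With this change the Open Food Facts base URL is read from the environment on every request, falling back to the production server. A sketch of overriding it (the staging hostname here is an assumption; substitute whichever deployment you want `ProductClient` to query):

```console
export OPENFOODFACTS_API_URL=https://world.openfoodfacts.net
```
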
app/import_queue/queue_manager.py (1 addition, 2 deletions)

@@ -21,11 +21,10 @@ def consume(self):
            if not item:
                print('Unable to retrieve product with code {}'.format(code))
                continue
-
            # As the code is unique (set in the save method), this will handle updates as well as new documents
            product = create_product_from_dict(item)
            product.save()
-           print(f'Recieved Redis update for product: {product.product_name}')
+           print(f'Received Redis update for product: {product.product_name}')
 
            # Now, write a key that can be read for full imports
            self.redis_client.write_processed(product.code)
app/models/product.py (4 additions, 3 deletions)

@@ -53,9 +53,10 @@ def create_product_from_dict(d):
 
 class Product(Document):
     """
-    This should mirror the fields here: https://github.com/openfoodfacts/openfoodfacts-server/blob/main/html/data/data-fields.txt
-    Use scripts/generate_product_from_data_fields.py to regenerate from data-fields.txt, but be careful for manual
-    adjustments
+    This was initially created with the scripts/generate_schema.py script. However, note that there have been manual
+    adjustments.
+    Furthermore, additional fields are added at index time, so below is just a subset of the available fields.
     """
 
     class Index:
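
As the docstring notes, the mapping is only a subset of what gets indexed. For reference, a minimal sketch of how the model is used by the import queue (the field values here are hypothetical):

```python
from app.models.product import create_product_from_dict
from app.utils import connection

connection.get_connection()  # open the Elasticsearch connection first

# Hypothetical document; real imports pass the full product dict.
product = create_product_from_dict({
    'code': '0000000000000',
    'product_name': 'Example product',
})
product.save()  # the code is unique, so this handles updates as well as inserts
```
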
app/scripts/http_autocomplete_query.py (12 additions, 3 deletions)

@@ -3,6 +3,7 @@
 """
 from __future__ import annotations
 
+import argparse
 import json
 import time
@@ -11,7 +12,7 @@
 from app.utils import connection
 
 
-def manual_query():
+def manual_query(hostname, port):
     connection.get_connection()
 
     while True:
@@ -25,7 +26,7 @@ def manual_query():
             'response_fields': ['product_name'],
         }
         response = requests.post(
-            'http://127.0.0.1:8001/autocomplete', json=payload,
+            '{}:{}/autocomplete'.format(hostname, port), json=payload,
         )
         print(json.dumps(response.json(), indent=4, sort_keys=True))
         print(f'Number of results: {len(response.json())}')
@@ -34,4 +35,12 @@ def manual_query():
 
 
 if __name__ == '__main__':
-    manual_query()
+    parser = argparse.ArgumentParser('http_autocomplete_query')
+    parser.add_argument(
+        '--hostname', type=str, default='http://127.0.0.1',
+    )
+    parser.add_argument(
+        '--port', type=int, default=8000,
+    )
+    args = parser.parse_args()
+    manual_query(args.hostname, args.port)
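
With the new arguments the script can target any deployment instead of only the previously hard-coded URL; for example, spelling out the defaults explicitly:

```console
python scripts/http_autocomplete_query.py --hostname=http://127.0.0.1 --port=8000
```
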
app/scripts/http_search_query.py (19 additions, 4 deletions)

@@ -3,6 +3,7 @@
 """
 from __future__ import annotations
 
+import argparse
 import json
 import time
@@ -11,7 +12,7 @@
 from app.utils import connection
 
 
-def manual_query():
+def manual_query(hostname, port, field):
     connection.get_connection()
 
     while True:
@@ -21,11 +22,12 @@ def manual_query():
         payload = {
             'string_filters': [
                 {
-                    'field': 'code',
+                    'field': field,
                     'value': search_term,
                     'operator': 'eq',
                 },
             ],
+            # To test more advanced features, uncomment the below
             # 'numeric_filters': [
             #     {
             #         'field': 'nutriments.sodium_value',
@@ -48,7 +50,9 @@ def manual_query():
             'num_results': 10,
             # 'response_fields': ['product_name', 'states_tags'],
         }
-        response = requests.post('http://127.0.0.1:8001/search', json=payload)
+        response = requests.post(
+            '{}:{}/search'.format(hostname, port), json=payload,
+        )
         print(
             json.dumps(
                 response.json(), indent=4,
@@ -61,4 +65,15 @@ def manual_query():
 
 
 if __name__ == '__main__':
-    manual_query()
+    parser = argparse.ArgumentParser('http_search_query')
+    parser.add_argument(
+        '--hostname', type=str, default='http://127.0.0.1',
+    )
+    parser.add_argument(
+        '--port', type=int, default=8000,
+    )
+    parser.add_argument(
+        '--field', help='Field to search', type=str, default='code',
+    )
+    args = parser.parse_args()
+    manual_query(args.hostname, args.port, args.field)
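
The added `--field` flag means exact-match searches are no longer limited to the barcode; for example, to filter on product name instead:

```console
python scripts/http_search_query.py --port=8000 --field=product_name
```
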
docker-compose.yml (4 additions, 19 deletions)

@@ -83,10 +83,6 @@ services:
       interval: 10s
       timeout: 10s
       retries: 120
-    networks:
-      - webnet
-    storage_opt:
-      size: '200G'
 
   es02:
     depends_on:
@@ -121,46 +117,35 @@
       interval: 10s
       timeout: 10s
       retries: 120
-    networks:
-      - webnet
-    storage_opt:
-      size: '200G'
 
   # elasticsearch browser
   elasticvue:
     image: cars10/elasticvue
     container_name: elasticvue
     ports:
       - '8080:8080'
-    links:
-      - es01
-    networks:
-      - webnet
 
   searchservice:
     image: off_search_image
     container_name: searchservice
     environment:
       - ELASTICSEARCH_URL=host.docker.internal:9200
       - REDIS_HOST=host.docker.internal
     ports:
       - '8000:8000'
     networks:
-      - webnet
+      - common_net
 
   searchredis:
     image: redis
     container_name: redis
     restart: always
     ports:
       - '6379:6379'
     command: redis-server --save 20 1 --loglevel warning
     volumes:
      - rediscache:/data
     networks:
-      - webnet
-
-
+      - common_net
 
 volumes:
   certs:
@@ -173,5 +158,5 @@ volumes:
     driver: local
 
 networks:
-  webnet:
-    name: webnet
+  common_net:
+    name: ${COMMON_NET_NAME}
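
Since the network name is now resolved from `${COMMON_NET_NAME}`, the stack joins whatever network `.env` names (`po_default` on dev, so the containers share a network with off-server). Because shell variables take precedence over `.env`, the value can also be overridden at launch time; a sketch with a hypothetical network name:

```console
COMMON_NET_NAME=my_custom_net docker-compose up -d
```
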
