Skip to content

Commit

Permalink
Merge pull request #17 from PDOK/PDOK-17192/synoniem-subst
Browse files Browse the repository at this point in the history
feat: use synonyms csv file to generate extra suggest records
  • Loading branch information
kad-dirc authored Dec 16, 2024
2 parents acea417 + 9ef1454 commit 22eb8bd
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 84 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,14 @@ docker run --rm --name postgis -e POSTGRES_PASSWORD=postgres -e POSTGRES_DB=test

./gomagpie create-search-index --db-name test_db

./gomagpie import-file --db-name test_db --config-file internal/etl/testdata/config.yaml --file internal/etl/testdata/addresses-crs84.gpkg --substitution-file internal/etl/testdata/substitution.csv --feature-table "addresses"

./gomagpie import-file --db-name test_db \
--file internal/etl/testdata/addresses-crs84.gpkg \
--feature-table "addresses" \
--config-file internal/etl/testdata/config.yaml \
--collection-id "addresses" \
--substitutions-file internal/etl/testdata/substitutions.csv \
--synonyms-file internal/etl/testdata/synonyms.csv

```

## Build
Expand Down
15 changes: 11 additions & 4 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ const (
featureTableFidFlag = "fid"
featureTableGeomFlag = "geom"
pageSizeFlag = "page-size"
substitutionFileFlag = "substitution-file"
substitutionsFileFlag = "substitutions-file"
synonymsFileFlag = "synonyms-file"
)

var (
Expand Down Expand Up @@ -240,11 +241,17 @@ func main() {
Required: true,
},
&cli.PathFlag{
Name: substitutionFileFlag,
EnvVars: []string{strcase.ToScreamingSnake(substitutionFileFlag)},
Name: substitutionsFileFlag,
EnvVars: []string{strcase.ToScreamingSnake(substitutionsFileFlag)},
Usage: "Path to csv file containing substitutions used to generate suggestions",
Required: true,
},
&cli.PathFlag{
Name: synonymsFileFlag,
EnvVars: []string{strcase.ToScreamingSnake(synonymsFileFlag)},
Usage: "Path to csv file containing synonyms used to generate suggestions",
Required: true,
},
&cli.StringFlag{
Name: featureTableFidFlag,
EnvVars: []string{strcase.ToScreamingSnake(featureTableFidFlag)},
Expand Down Expand Up @@ -289,7 +296,7 @@ func main() {
if collection == nil {
return fmt.Errorf("no configured collection found with id: %s", collectionID)
}
return etl.ImportFile(*collection, c.String(searchIndexFlag), c.Path(fileFlag), c.Path(substitutionFileFlag), featureTable,
return etl.ImportFile(*collection, c.String(searchIndexFlag), c.Path(fileFlag), c.Path(substitutionsFileFlag), c.Path(synonymsFileFlag), featureTable,
c.Int(pageSizeFlag), dbConn)
},
},
Expand Down
6 changes: 3 additions & 3 deletions internal/etl/etl.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ type Extract interface {
type Transform interface {

// Transform each raw record into one or more search records depending on the given configuration
Transform(records []t.RawRecord, collection config.GeoSpatialCollection, substitutionFile string) ([]t.SearchIndexRecord, error)
Transform(records []t.RawRecord, collection config.GeoSpatialCollection, substitutionsFile string, synonymsFile string) ([]t.SearchIndexRecord, error)
}

// Load - the 'L' in ETL. Datasource agnostic interface to load data into target database.
Expand All @@ -52,7 +52,7 @@ func CreateSearchIndex(dbConn string, searchIndex string) error {
}

// ImportFile imports source data into the target search index using the extract-transform-load principle
func ImportFile(collection config.GeoSpatialCollection, searchIndex string, filePath string, substitutionFile string, table config.FeatureTable,
func ImportFile(collection config.GeoSpatialCollection, searchIndex string, filePath string, substitutionsFile string, synonymsFile string, table config.FeatureTable,
pageSize int, dbConn string) error {

log.Println("start importing")
Expand Down Expand Up @@ -84,7 +84,7 @@ func ImportFile(collection config.GeoSpatialCollection, searchIndex string, file
if len(sourceRecords) == 0 {
break // no more batches of records to extract
}
targetRecords, err := transformer.Transform(sourceRecords, collection, substitutionFile)
targetRecords, err := transformer.Transform(sourceRecords, collection, substitutionsFile, synonymsFile)
if err != nil {
return fmt.Errorf("failed to transform raw records to search index records: %w", err)
}
Expand Down
4 changes: 2 additions & 2 deletions internal/etl/etl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func TestImportGeoPackage(t *testing.T) {
{
name: "import everything",
where: "",
count: 67230, // 33030*2 + substitution mutations
count: 67230, // 33030*2 + substitution and synonyms combinations
},
{
name: "with where clause",
Expand Down Expand Up @@ -97,7 +97,7 @@ func TestImportGeoPackage(t *testing.T) {
assert.NoError(t, err)

table := config.FeatureTable{Name: "addresses", FID: "fid", Geom: "geom"}
err = ImportFile(*collection, "search_index", pwd+"/testdata/addresses-crs84.gpkg", pwd+"/testdata/substitution.csv", table, 1000, dbConn)
err = ImportFile(*collection, "search_index", pwd+"/testdata/addresses-crs84.gpkg", pwd+"/testdata/substitutions.csv", pwd+"/testdata/synonyms.csv", table, 1000, dbConn)
assert.NoError(t, err)

// check nr of records
Expand Down
File renamed without changes.
3 changes: 3 additions & 0 deletions internal/etl/testdata/synonyms.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
eerste,1ste
tweede,2de
fryslân,friesland
105 changes: 66 additions & 39 deletions internal/etl/transform/extend_values.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,46 @@ import (
"encoding/csv"
"os"
"strings"

"github.com/PDOK/gomagpie/internal/engine/util"
)

func generateFieldValuesSubstitutions(fieldValuesByName map[string]any, substitutionFile string) ([]map[string]any, error) {
substitutions, err := readSubstitutionsFile(substitutionFile)
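// extendFieldValues returns a slice of fieldValuesByName maps: one map for every combination of the
// lower-cased field values extended with their substitutions and synonyms (synonyms applied in both directions).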
func extendFieldValues(fieldValuesByName map[string]any, substitutionsFile, synonymsFile string) ([]map[string]any, error) {
substitutions, err := readCsvFile(substitutionsFile)
if err != nil {
return nil, err
}
synonyms, err := readCsvFile(synonymsFile)
if err != nil {
return nil, err
}
var fieldValuesByNameExtensions = make(map[string][]string)

var fieldValuesByNameWithAllValues = make(map[string][]string)
for key, value := range fieldValuesByName {
valueSubstitutions, err := applySubstitutions(value.(string), substitutions)
valueLower := strings.ToLower(value.(string))

// Get all substitutions
substitutedValues, err := extendValues([]string{valueLower}, substitutions)
if err != nil {
return nil, err
}
fieldValuesByNameExtensions[key] = valueSubstitutions
// Get all synonyms for these substituted values
// one way
synonymsValuesOneWay, err := extendValues(substitutedValues, synonyms)
if err != nil {
return nil, err
}
// reverse way
allValues, err := extendValues(synonymsValuesOneWay, util.Inverse(synonyms))
if err != nil {
return nil, err
}

// Store, for each key, the slice of all values generated for it
fieldValuesByNameWithAllValues[key] = allValues
}
return generateAllFieldValuesByName(fieldValuesByNameExtensions), err
return generateAllFieldValuesByName(fieldValuesByNameWithAllValues), err
}

// Transform a map[string][]string into a []map[string]string using the cartesian product, i.e.
Expand All @@ -34,50 +58,53 @@ func generateAllFieldValuesByName(input map[string][]string) []map[string]any {
values = append(values, vals)
}

return generateCombinations(keys, values, 0, make(map[string]any))
return generateCombinations(keys, values)
}

// Generate all combinations (cartesian product) of the given values per key
func generateCombinations(keys []string, values [][]string, keyDepth int, current map[string]any) []map[string]any {
var result []map[string]any
if keyDepth == len(keys) {
newEntry := make(map[string]any)
for k, v := range current {
newEntry[k] = v
}
return []map[string]any{newEntry}
func generateCombinations(keys []string, values [][]string) []map[string]any {
if len(keys) == 0 || len(values) == 0 {
return nil
}

for _, val := range values[keyDepth] {
current[keys[keyDepth]] = val
partialResult := generateCombinations(keys, values, keyDepth+1, current)
result = append(result, partialResult...)
result := []map[string]any{{}} // contains empty map so the first iteration works
for keyDepth := 0; keyDepth < len(keys); keyDepth++ {
var newResult []map[string]any
for _, entry := range result {
for _, val := range values[keyDepth] {
newEntry := make(map[string]any)
for k, v := range entry {
newEntry[k] = v
}
newEntry[keys[keyDepth]] = val
newResult = append(newResult, newEntry)
}
}
result = newResult
}

return result
}

func applySubstitutions(input string, substitutions map[string]string) ([]string, error) {
inputLower := strings.ToLower(input)
func extendValues(input []string, mapping map[string]string) ([]string, error) {
var results []string
results = append(results, inputLower)

for oldChar, newChar := range substitutions {
if strings.Contains(inputLower, oldChar) {
for i := 0; i < strings.Count(inputLower, oldChar); i++ {
substituted, err := replaceNth(inputLower, oldChar, newChar, i+1)
if err != nil {
return nil, err
results = append(results, input...)

for j := range input {
for oldChar, newChar := range mapping {
if strings.Contains(input[j], oldChar) {
for i := 0; i < strings.Count(input[j], oldChar); i++ {
extendedInput, err := replaceNth(input[j], oldChar, newChar, i+1)
if err != nil {
return nil, err
}
subCombinations, err := extendValues([]string{extendedInput}, mapping)
if err != nil {
return nil, err
}
results = append(results, subCombinations...)
}
subCombinations, err := applySubstitutions(substituted, substitutions)
if err != nil {
return nil, err
}
results = append(results, subCombinations...)
}
}
}

// Possible performance improvement here by avoiding duplicates in the first place
return uniqueSlice(results), nil
}

Expand Down Expand Up @@ -114,7 +141,7 @@ func uniqueSlice(s []string) []string {
return results
}

func readSubstitutionsFile(filepath string) (map[string]string, error) {
func readCsvFile(filepath string) (map[string]string, error) {
substitutions := make(map[string]string)

file, err := os.Open(filepath)
Expand Down
Loading

0 comments on commit 22eb8bd

Please sign in to comment.