-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathDAWG_Compressor.c
777 lines (692 loc) · 37.5 KB
/
DAWG_Compressor.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
// This program demonstrates the sleek construction of a lexicon data structure known as the "Directed Acyclic Word Graph".
// Weighing in at nearly 800 lines, this algorithm was not forged through trivial and comprehensive documentation.
// To set up this traditional DAWG for creation, set up the following 3 conditions.
// 1) "Lexicon.txt" is a word list, where all characters are capital, and the number of words is written on the very first line. Linux format, so no DOS CR character.
// 2) Set the constant MAX below to the length of the longest word in the lexicon. One letter words are invalid in this program.
// 3) Understand the procedures for extracting information from the unsigned integer nodes.
// Keep in mind that traversing a traditional DAWG is only optimal when it takes place in alphabetical order.
// The unsigned integer encoding that we are using for this "Dawg" allows for up to 33554432 nodes. That is very many more nodes than an current English lexicon will require.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#define MAX 15
#define MIN 2
#define NUMBER_OF_REDUCED_LETTERS 14
#define INPUT_LIMIT 30
#define BIG_IT_UP -32
#define LETTER_BIT_SHIFT 25
#define LETTER_BIT_MASK 1040187392
#define CHILD_INDEX_BIT_MASK 33554431
#define END_OF_WORD_BIT_MASK 2147483648
#define END_OF_LIST_BIT_MASK 1073741824
#define BINARY_STRING_LENGTH 38
// we will store the DAWG into a ".txt" file to verify the output, and a ".dat" file for use.
#define RAW_LEXICON "Lexicon.txt"
#define DAWG_DATA "Dawg_For_Lexicon.dat"
#define DAWG_TEXT_DATA "Dawg_Text_For_Lexicon.txt"
// When reading strings from a file, the new-line character is appended, and this macro will remove it before processing.
#define CUT_OFF_NEW_LINE(string) (string[strlen(string) - 1] = '\0')
// C requires a boolean variable type so use C's typedef concept to create one.
typedef enum { FALSE = 0, TRUE = 1 } Bool;
typedef Bool* BoolPtr;
// This array streamlines the conversion of digit strings into integers. Unsigned integers do not exceed the low billions, so ten numbers will suffice.
unsigned int PowersOfTen[10] = {1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000};
unsigned int PowersOfTwo[32] = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824, 2147483648};
// Returns the unsigned integer rerpresented by "TheNumberNotYet" string, and if not a number greater then zero, returns 0.
unsigned int StringToUnsignedInt(char *TheNumberNotYet){
unsigned int Result = 0;
unsigned int X;
unsigned int Digits = strlen(TheNumberNotYet);
for ( X = 0; X < Digits; X++ ) {
if ( !(TheNumberNotYet[X] >= '0' && TheNumberNotYet[X] <= '9') ) {
printf("TheNumberNotYet[X] not a numeral ?? |%c|\n", TheNumberNotYet[X]);
return 0;
}
unsigned int power = PowersOfTen[Digits - X - 1];
printf("power |%d|\n", power);
Result += (TheNumberNotYet[X] - '0')*power;
}
return Result;
}
// This Function converts any lower case letters in the string "RawWord", into capitals, so that the whole string is made of capital letters.
void MakeMeAllCapital(char *RawWord){
unsigned int X;
for ( X = 0; X < strlen(RawWord); X++ ){
if (RawWord[X] >= 'a' && RawWord[X] <= 'z' ) RawWord[X] = RawWord[X] + BIG_IT_UP;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// "Tnode" functionality. These will be the nodes contained in a "Trie" designed to convert into a "Dawg".
struct tnode {
struct tnode* Next;
struct tnode* Child;
struct tnode* ParentalUnit;
// When populating the array, you must know the indices of child nodes. Hence we Store "ArrayIndex" in every node so that we can mine the information from the reduced "Trie".
int ArrayIndex;
char DirectChild;
char Letter;
char MaxChildDepth;
char Level;
char NumberOfChildren;
char Dangling;
char EndOfWordFlag;
};
typedef struct tnode Tnode;
typedef Tnode* TnodePtr;
// Extracting "Tnode" member data will consume a lot of time when finding redundant nodes, so define these functions as macros.
// The only negative effect is that the programmer must be diligent to use these constructs on the right data. Otherwise, this decision makes the program faster.
#define TNODE_ARRAY_INDEX(thistnode) (thistnode->ArrayIndex)
#define TNODE_DIRECT_CHILD(thistnode) (thistnode->DirectChild)
#define TNODE_NEXT(thistnode) (thistnode->Next)
#define TNODE_CHILD(thistnode) (thistnode->Child)
#define TNODE_PARENTAL_UNIT(thistnode) (thistnode->ParentalUnit)
#define TNODE_LETTER(thistnode) (thistnode->Letter)
#define TNODE_MAX_CHILD_DEPTH(thistnode) (thistnode->MaxChildDepth)
#define TNODE_NUMBER_OF_CHILDREN(thistnode) (thistnode->NumberOfChildren)
#define TNODE_END_OF_WORD_FLAG(thistnode) (thistnode->EndOfWordFlag)
#define TNODE_LEVEL(thistnode) (thistnode->Level)
#define TNODE_DANGLING(thistnode) (thistnode->Dangling)
TnodePtr TnodeInit(char Chap, TnodePtr OverOne, char WordEnding, char Leveler, int StarterDepth, TnodePtr Parent, char IsaChild){
TnodePtr Result = (Tnode*)malloc(sizeof(Tnode));
Result->Letter = Chap;
Result->ArrayIndex = 0;
Result->NumberOfChildren = 0;
Result->MaxChildDepth = StarterDepth;
Result->Next = OverOne;
Result->Child = NULL;
Result->ParentalUnit = Parent;
Result->Dangling = FALSE;
Result->EndOfWordFlag = WordEnding;
Result->Level = Leveler;
Result->DirectChild = IsaChild;
return Result;
}
// Define all of the member-setting functions as macros for faster performance in "Trie" creation.
#define TNODE_SET_ARRAY_INDEX(thistnode, thewhat) (thistnode->ArrayIndex = thewhat)
#define TNODE_SET_CHILD(thistnode, nexis) (thistnode->Child = nexis)
#define TNODE_SET_NEXT(thistnode, nexi) (thistnode->Next = nexi)
#define TNODE_SET_PARENTAL_UNIT(thistnode, beforeme) (thistnode->ParentalUnit = beforeme)
#define TNODE_ADD_ONE_CHILD(thistnode) (thistnode->NumberOfChildren += 1)
#define TNODE_SET_MAX_CHILD_DEPTH(thistnode, newdepth) (thistnode->MaxChildDepth = newdepth)
#define TNODE_SET_DIRECT_CHILD(thistnode, status) (thistnode->DirectChild = status)
#define TNODE_FLY_END_OF_WORD_FLAG(thistnode) (thistnode->EndOfWordFlag = TRUE)
#define TNODE_DANGLE_ME(thistnode) (thistnode->Dangling = TRUE)
// This function Dangles a node, but also recursively dangles every node under it as well, that way nodes that are not direct children do hit the chopping block.
// The function returns the total number of nodes dangled as a result.
int TnodeDangle(TnodePtr ThisTnode){
int Result = 0;
if ( TNODE_DANGLING(ThisTnode) == TRUE ) return 0;
if ( TNODE_NEXT(ThisTnode) != NULL ) Result += TnodeDangle(TNODE_NEXT(ThisTnode));
if ( TNODE_CHILD(ThisTnode) != NULL ) Result += TnodeDangle(TNODE_CHILD(ThisTnode));
TNODE_DANGLE_ME(ThisTnode);
Result += 1;
return Result;
}
// This function returns the pointer to the Tnode in a parallel list of nodes with the letter "ThisLetter", and returns NULL if the Tnode does not exist.
// In the "NULL" case, an insertion will be required.
TnodePtr TnodeFindParaNode(TnodePtr ThisTnode, char ThisLetter){
if ( ThisTnode == NULL ) return NULL;
TnodePtr Result = ThisTnode;
if ( TNODE_LETTER(Result) == ThisLetter ) return Result;
while ( TNODE_LETTER(Result) < ThisLetter ) {
Result = TNODE_NEXT(Result);
if ( Result == NULL ) return NULL;
}
if ( TNODE_LETTER(Result) == ThisLetter ) return Result;
else return NULL;
}
// This function inserts a new Tnode before a larger letter node or at the end of a Para-List If the list does not exist then it is put at the beginnung.
// The new node has "ThisLetter" in it. "AboveTnode" is the node 1 level above where the new node will be placed.
// This function should never be passed a "NULL" pointer. "ThisLetter" should never exist in the "Child" Para-List.
void TnodeInsertParaNode(TnodePtr AboveTnode, char ThisLetter, char WordEnder, int StartDepth){
TNODE_ADD_ONE_CHILD(AboveTnode);
TnodePtr Holder = NULL;
TnodePtr Currently = NULL;
// Case 1: Para-List does not exist yet, so start it.
if ( TNODE_CHILD(AboveTnode) == NULL ) TNODE_SET_CHILD(AboveTnode, TnodeInit(ThisLetter, NULL, WordEnder, TNODE_LEVEL(AboveTnode) + 1, StartDepth, AboveTnode, TRUE));
// Case 2: ThisLetter should be the first in the Para-List that already exists.
else if ( TNODE_LETTER(TNODE_CHILD(AboveTnode)) > ThisLetter ) {
Holder = TNODE_CHILD(AboveTnode);
// The "Holder" node will no longer be a direct child, so set it as such.
TNODE_SET_DIRECT_CHILD(Holder, FALSE);
TNODE_SET_CHILD(AboveTnode, TnodeInit(ThisLetter, Holder, WordEnder, TNODE_LEVEL(AboveTnode) + 1, StartDepth, AboveTnode, TRUE ));
// "Holder"'s "ParentalUnit" is now the new node, so make the necessary change.
TNODE_SET_PARENTAL_UNIT(Holder, TNODE_CHILD(AboveTnode));
}
// Case 3: The ParaList exists and ThisLetter is not first in the list. This is the default case condition: "if ( TNODE_LETTER(TNODE_CHILD(AboveTnode)) < ThisLetter )".
else {
Currently = TNODE_CHILD(AboveTnode);
while ( TNODE_NEXT(Currently) != NULL ) {
if ( TNODE_LETTER(TNODE_NEXT(Currently)) > ThisLetter ) break;
Currently = TNODE_NEXT(Currently);
}
Holder = TNODE_NEXT(Currently);
TNODE_SET_NEXT(Currently, TnodeInit(ThisLetter, Holder, WordEnder, TNODE_LEVEL(AboveTnode) + 1, StartDepth, Currently, FALSE));
if ( Holder != NULL ) TNODE_SET_PARENTAL_UNIT(Holder, TNODE_NEXT(Currently));
}
}
// This function returns "TRUE" if "FirstNode" and "SecondNode" are found to be the parent nodes of equal tree branches. This includes identical nodes in the current list.
// The "MaxChildDepth" of the two nodes can not be assumed equal due to the recursive nature of this function, so we must check for equivalence.
Bool TnodeAreWeTheSame(TnodePtr FirstNode, TnodePtr SecondNode){
if ( FirstNode == SecondNode ) return TRUE;
if ( FirstNode == NULL || SecondNode == NULL ) return FALSE;
if ( TNODE_LETTER(FirstNode) != TNODE_LETTER(SecondNode) ) return FALSE;
if ( TNODE_MAX_CHILD_DEPTH(FirstNode) != TNODE_MAX_CHILD_DEPTH(SecondNode) ) return FALSE;
if ( TNODE_NUMBER_OF_CHILDREN(FirstNode) != TNODE_NUMBER_OF_CHILDREN(SecondNode) ) return FALSE;
if ( TNODE_END_OF_WORD_FLAG(FirstNode) != TNODE_END_OF_WORD_FLAG(SecondNode) ) return FALSE;
if ( TnodeAreWeTheSame(TNODE_CHILD(FirstNode), TNODE_CHILD(SecondNode)) == FALSE ) return FALSE;
if ( TnodeAreWeTheSame(TNODE_NEXT(FirstNode), TNODE_NEXT(SecondNode)) == FALSE ) return FALSE;
else return TRUE;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// "Trie" functionality.
struct trie {
int NumberOfTotalWords;
int NumberOfTotalNodes;
TnodePtr First;
};
typedef struct trie Trie;
typedef Trie* TriePtr;
// Set up the "First" node in the Trie.
TriePtr TrieInit (void){
TriePtr Result;
Result = (TriePtr)malloc(sizeof(Trie));
Result->NumberOfTotalWords = 0;
Result->NumberOfTotalNodes = 0;
// The "First" node in the "Trie" will be a dummy node.
Result->First = TnodeInit('0', NULL, FALSE, 0, 0, NULL, FALSE);
return Result;
}
TnodePtr TrieRootTnode(TriePtr ThisTrie){
if ( ThisTrie->NumberOfTotalWords > 0 ) return TNODE_CHILD(ThisTrie->First);
else return NULL;
}
// This function simply makes "TrieAddWord" look a lot smaller. It returns the number of new nodes that it just inserted.
int TnodeAddWord(TnodePtr ParentNode, const char *Word){
int Result = 0;
int X, Y;
int WordLength = strlen(Word);
TnodePtr HangPoint = NULL;
TnodePtr Current = ParentNode;
// Try to follow the path of "Word" until it doesn't exist, and then hang a new chain of nodes to include it in the trie.
for ( X = 0; X < WordLength; X++ ) {
HangPoint = TnodeFindParaNode(TNODE_CHILD(Current), Word[X]);
if ( HangPoint == NULL ) {
TnodeInsertParaNode(Current, Word[X], ((X == (WordLength - 1))? TRUE: FALSE), WordLength - X - 1);
Result += 1;
Current = TnodeFindParaNode(TNODE_CHILD(Current), Word[X]);
for ( Y = X + 1; Y < WordLength; Y++ ) {
TnodeInsertParaNode(Current, Word[Y], ((Y == (WordLength - 1))? TRUE: FALSE), WordLength - Y - 1);
Result += 1;
Current = TNODE_CHILD(Current);
}
break;
}
else if ( TNODE_MAX_CHILD_DEPTH(HangPoint) < (WordLength - X - 1) ) TNODE_SET_MAX_CHILD_DEPTH(HangPoint, (WordLength - X - 1));
Current = HangPoint;
// The path for the word that we are trying to insert already exists, so just make sure that the end flag is flying on the last node. This should never happen if words are added in alphabetical order, but this is not a requirement of the program.
if ( X == WordLength - 1 ) TNODE_FLY_END_OF_WORD_FLAG(Current);
}
return Result;
}
// This function adds to "ThisTrie"'s counter variables and adds "NewWord", using "TnodeAddWord", where the real addition algorithm exists.
void TrieAddWord(TriePtr ThisTrie, char *NewWord){
ThisTrie->NumberOfTotalWords += 1;
ThisTrie->NumberOfTotalNodes += TnodeAddWord( ThisTrie->First, NewWord );
}
// This is a standard depth-first preorder tree traversal, whereby the objective is to count nodes of various "MaxChildDepth"s.
// The counting results are placed into the "Tabulator" array. This will be used to streamline the "Trie"-to-"Dawg" conversion process.
void TnodeGraphTabulateRecurse(TnodePtr ThisTnode, int Level, int* Tabulator){
if ( Level == 0 ) TnodeGraphTabulateRecurse(TNODE_CHILD(ThisTnode), Level + 1, Tabulator);
else if ( TNODE_DANGLING(ThisTnode) == FALSE ) {
Tabulator[TNODE_MAX_CHILD_DEPTH(ThisTnode)] += 1;
// Go Down if possible.
if ( TNODE_CHILD(ThisTnode) != NULL ) TnodeGraphTabulateRecurse(TNODE_CHILD(ThisTnode), Level + 1, Tabulator );
// Go Right through the Para-List if possible.
if ( TNODE_NEXT(ThisTnode) != NULL ) TnodeGraphTabulateRecurse(TNODE_NEXT(ThisTnode), Level, Tabulator );
}
}
// Count the "Tnode"'s in "ThisTrie" for each "MaxChildDepth". "Count" needs to be an integer array of size "MAX".
void TrieGraphTabulate(TriePtr ThisTrie, int* Count){
int *Numbers = (int*)malloc(MAX*sizeof(int));
int X;
for ( X = 0; X < MAX; X++ ) Numbers[X] = 0;
if ( ThisTrie->NumberOfTotalWords > 0) {
TnodeGraphTabulateRecurse(ThisTrie->First, 0, Numbers);
for ( X = 0; X < MAX; X++ ) {
Count[X] = Numbers[X];
}
}
free(Numbers);
}
// This function can never be called after a trie has been mowed because this will free pointers twice resulting in a core dump!
void FreeTnodeRecurse(TnodePtr ThisTnode){
if ( TNODE_CHILD(ThisTnode) != NULL ) FreeTnodeRecurse(TNODE_CHILD(ThisTnode)) ;
if ( TNODE_NEXT(ThisTnode) != NULL ) FreeTnodeRecurse(TNODE_NEXT(ThisTnode));
free(ThisTnode);
}
// An important function, it returns the first node in the list "MaxChildDepthWist", that is identical to "ThisTnode".
// If the function returns a value equal to "ThisTnode", then it is the first of its kind in the "Trie"
TnodePtr TnodeMexicanEquivalent( TnodePtr ThisTnode, TnodePtr ** MaxChildDepthWist ) {
int Tall = TNODE_MAX_CHILD_DEPTH(ThisTnode);
int X = 0;
while ( TnodeAreWeTheSame(ThisTnode, MaxChildDepthWist[Tall][X]) == FALSE ) {
X += 1;
}
return MaxChildDepthWist[Tall][X];
}
// Recursively replaces all redundant nodes in a trie with their first equivalent.
void TnodeLawnMowerRecurse(TnodePtr ThisTnode, TnodePtr ** HeightWits){
printf("\nTnodeLawnMowerRecurse\n");
if ( TNODE_LEVEL(ThisTnode) == 0 ) TnodeLawnMowerRecurse(TNODE_CHILD(ThisTnode), HeightWits);
else {
// When replacing a "Tnode", we must do so knowing that "ThisTnode" is how we got to it.
if ( TNODE_NEXT(ThisTnode) == NULL && TNODE_CHILD(ThisTnode) == NULL ) return;
if ( TNODE_CHILD(ThisTnode) != NULL ) {
// we have found a node that has been tagged for mowing, so let us destroy it, not literally, and replace it with its first equivelant in the "HeightWits" list, which won't be tagged.
if ( TNODE_DANGLING(TNODE_CHILD(ThisTnode)) == TRUE ) {
TNODE_SET_CHILD(ThisTnode, TnodeMexicanEquivalent(TNODE_CHILD(ThisTnode), HeightWits));
}
else TnodeLawnMowerRecurse( ThisTnode->Child, HeightWits );
}
// Traverse the rest of the "Trie", but a "Tnode" that is not a direct child will never be directly replaced.
// This will allow the resulting "Dawg" to fit into a contiguous array of node lists.
if ( TNODE_NEXT(ThisTnode) != NULL ) TnodeLawnMowerRecurse(TNODE_NEXT(ThisTnode), HeightWits);
}
}
// Replaces each redundant list in "ThisTrie" with its first equivalent.
void TrieLawnMower(TriePtr ThisTrie, TnodePtr ** HeightWise){
TnodeLawnMowerRecurse(ThisTrie->First, HeightWise );
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// A queue is required for breadth first traversal, and the rest is self evident.
struct breadthqueuenode {
TnodePtr Element;
struct breadthqueuenode *Next;
};
typedef struct breadthqueuenode BreadthQueueNode;
typedef BreadthQueueNode* BreadthQueueNodePtr;
// In the spirit of keeping the code fast, use macros for basic functionality.
#define BREADTH_QUEUE_NODE_NEXT(thisbreadthqueuenode) (thisbreadthqueuenode->Next)
#define BREADTH_QUEUE_NODE_ELEMENT(thisbreadthqueuenode) (thisbreadthqueuenode->Element);
#define BREADTH_QUEUE_NODE_SET_NEXT(thisbreadthqueuenode, nexit) (thisbreadthqueuenode->Next = nexit)
#define BREADTH_QUEUE_NODE_SET_ELEMENT(thisbreadthqueuenode, element) (thisbreadthqueuenode->Element = element)
BreadthQueueNodePtr BreadthQueueNodeInit(TnodePtr NewElement){
BreadthQueueNodePtr Result = (BreadthQueueNode*)malloc(sizeof(BreadthQueueNode));
Result->Element = NewElement;
Result->Next = NULL;
return Result;
}
struct breadthqueue {
BreadthQueueNodePtr Front;
BreadthQueueNodePtr Back;
int Size;
};
typedef struct breadthqueue BreadthQueue;
typedef BreadthQueue* BreadthQueuePtr;
BreadthQueuePtr BreadthQueueInit( void ) {
BreadthQueuePtr Result = (BreadthQueue*)malloc(sizeof(BreadthQueue));
Result->Front = NULL;
Result->Back = NULL;
Result->Size = 0;
}
void BreadthQueuePush(BreadthQueuePtr ThisBreadthQueue, TnodePtr NewElemental){
BreadthQueueNodePtr Noob = BreadthQueueNodeInit(NewElemental);
if ( (ThisBreadthQueue->Back) != NULL ) BREADTH_QUEUE_NODE_SET_NEXT(ThisBreadthQueue->Back, Noob);
else ThisBreadthQueue->Front = Noob;
ThisBreadthQueue->Back = Noob;
ThisBreadthQueue->Size += 1;
}
TnodePtr BreadthQueuePop(BreadthQueuePtr ThisBreadthQueue){
TnodePtr Result;
if ( ThisBreadthQueue->Size == 0 ) return NULL;
if ( ThisBreadthQueue->Size == 1 ) {
ThisBreadthQueue->Back = NULL;
ThisBreadthQueue->Size = 0;
Result = BREADTH_QUEUE_NODE_ELEMENT(ThisBreadthQueue->Front);
free(ThisBreadthQueue->Front);
ThisBreadthQueue->Front = NULL;
return Result;
}
Result = BREADTH_QUEUE_NODE_ELEMENT(ThisBreadthQueue->Front);
BreadthQueueNodePtr Holder = ThisBreadthQueue->Front;
ThisBreadthQueue->Front = BREADTH_QUEUE_NODE_NEXT(ThisBreadthQueue->Front);
free(Holder);
ThisBreadthQueue->Size -= 1;
return Result;
}
void BreadthQueuePopulateReductionArray(BreadthQueuePtr ThisBreadthQueue, TnodePtr Root, TnodePtr **Holder){
printf( "Inside external BreadthQueuePopulateReductionArray function.\n" );
int Taker[MAX];
int X = 0;
int PopCount = 0;
int CurrentMaxChildDepth = 0;
TnodePtr Current = Root;
for ( X = 0; X < MAX; X++ ) Taker[X] = 0;
// Push the first row onto the queue.
while ( Current != NULL ) {
BreadthQueuePush(ThisBreadthQueue, Current);
Current = TNODE_NEXT(Current);
}
// Initiate the pop-followed-by-push-all-children loop.
while ( (ThisBreadthQueue->Size) != 0 ) {
Current = BreadthQueuePop(ThisBreadthQueue);
PopCount += 1;
CurrentMaxChildDepth = TNODE_MAX_CHILD_DEPTH(Current);
Holder[CurrentMaxChildDepth][Taker[CurrentMaxChildDepth]] = Current;
Taker[CurrentMaxChildDepth] += 1;
Current = TNODE_CHILD(Current);
while ( Current != NULL ) {
BreadthQueuePush(ThisBreadthQueue, Current);
Current = TNODE_NEXT(Current);
}
}
printf( "Completed Populating The Reduction Array.\n" );
}
// This function will label all of the remaining nodes in the Trie-Turned-Dawg so that they will fit contiguously into an unsigned integer array.
// The return value is the final index number handed out to the "Tnode"'s, and hence one less than the size of the array that they need to fit into.
int BreadthQueueUseToIndex(BreadthQueuePtr ThisBreadthQueue, TnodePtr Root){
// The first index to be given out will be "1" because "0" denotes "NULL".
int IndexNow = 0;
TnodePtr Current = Root;
// Push the first Para-List onto the queue.
while ( Current != NULL ) {
BreadthQueuePush(ThisBreadthQueue, Current);
Current = TNODE_NEXT(Current);
}
// Pop each element off of the queue and only push its children if has not been dangled yet.
// Assign an index to the node, if one has not been given to it yet. Nodes will be placed on the queue many times.
while ( (ThisBreadthQueue->Size) != 0 ) {
Current = BreadthQueuePop(ThisBreadthQueue);
// If the "Current" node already has an index, don't give it a new one.
if ( TNODE_ARRAY_INDEX(Current) == 0 ) {
IndexNow += 1;
TNODE_SET_ARRAY_INDEX(Current, IndexNow);
Current = TNODE_CHILD(Current);
while ( Current != NULL ) {
BreadthQueuePush(ThisBreadthQueue, Current);
Current = TNODE_NEXT(Current);
}
}
}
printf( "Completed Assigning Indices To Living Nodes.\n" );
return IndexNow;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// This section contains code related to the format of unsigned integers that represent "Dawg" nodes.
// The "BinaryNode" string must be at least 32 + 5 + 1 bytes in length. Space for the bits, the seperation pipes, and the end of string char.
void ConvertUnsignedIntNodeToBinaryString(unsigned int TheNode, char* BinaryNode){
int X;
int Bit;
BinaryNode[0] = '|';
// Bit 31, the leftmost bit, represents the "EndOfWordFlag".
BinaryNode[1] = (TheNode & PowersOfTwo[31])?'1':'0';
BinaryNode[2] = '|';
// Bit 30 represents the "EndOfListFlag".
BinaryNode[3] = (TheNode & PowersOfTwo[30])?'1':'0';
BinaryNode[4] = '|';
// Bit 29 to bit 25 are the "Letter" bits.
BinaryNode[5] = (TheNode & PowersOfTwo[29])?'1':'0';
BinaryNode[6] = (TheNode & PowersOfTwo[28])?'1':'0';
BinaryNode[7] = (TheNode & PowersOfTwo[27])?'1':'0';
BinaryNode[8] = (TheNode & PowersOfTwo[26])?'1':'0';
BinaryNode[9] = (TheNode & PowersOfTwo[25])?'1':'0';
BinaryNode[10] = '|';
// Bit 24 to bit 0, are space of the first child index.
Bit = 24;
for ( X = 11; X <= 35; X++ ) {
BinaryNode[X] = (TheNode & PowersOfTwo[Bit])? '1': '0';
Bit -= 1;
}
BinaryNode[36] = '|';
BinaryNode[37] = '\0';
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// This function is the core of the "Dawg" creation procedure. Pay close attention to the order of the steps involved.
unsigned int *DawgInit(char **Dictionary, int *SegmentLenghts, int MaxStringLength, int MinStringLength){
unsigned int *Result;
int X = 0;
int Y;
int *ChildCount;
char *ChildStrings;
TriePtr TemporaryTrie;
int *NodeNumberCounter;
int *NodeNumberCounterInit;
int TotalNodeSum = 0;
TnodePtr ** HolderOfAllTnodePointers;
BreadthQueuePtr Populator;
int NumberDangled;
int TotalDangled = 0;
int W;
int DangleCount[MaxStringLength];
int NumberOfLivingNodes;
int TotalDangledCheck = 0;
BreadthQueuePtr OrderMatters;
int IndexCount;
int IndexFollow;
int IndexFollower = 0;
unsigned int UnsignedEncodingValue;
FILE *Text;
FILE *Data;
char TheNodeInBinary[BINARY_STRING_LENGTH];
printf("\nStep 0 - Get ready to watch the Dawg creation procedure.\n");
while ( SegmentLenghts[X] == 0 ) X += 1;
printf("\nStep 1 - Create the intermediate Trie and begin filling it with the provided lexicon.\n");
// Create a Temp Trie structure and then feed in the given lexicon.
TemporaryTrie = TrieInit();
for ( X = MinStringLength; X <= MaxStringLength; X++ ) {
for ( Y = 0; Y < SegmentLenghts[X]; Y++ ) {
TrieAddWord(TemporaryTrie, &(Dictionary[X][(X + 1)*Y]));
}
}
printf("\nStep 2 - Finished adding words to the temporary Trie, so set up the space to sort the Tnodes by MaxChildDepth.\n");
printf("\nStep 2 - Finished adding words to the temporary Trie, so set up the space to sort the Tnodes by MaxChildDepth.\n");
// Create the associated pointer array with all the nodes inside it.
NodeNumberCounter = (int*)malloc(MaxStringLength*sizeof(int) );
for ( X = 0; X < MaxStringLength; X++ ) NodeNumberCounter[X] = 0;
NodeNumberCounterInit = (int*)malloc(MaxStringLength*sizeof(int));
printf("\nStep 3 - Count the total number of nodes in the raw Trie by MaxChildDepth.\n");
TrieGraphTabulate(TemporaryTrie, NodeNumberCounter);
printf("\nStep 4 - Initial counting of Trie nodes complete, so display the results.\n\n");
for ( X = 0; X < MaxStringLength; X++ ) {
NodeNumberCounterInit[X] = NodeNumberCounter[X];
TotalNodeSum += NodeNumberCounter[X];
}
for ( X = 0; X < MaxStringLength; X++ ) {
printf("Initial Count For MaxChildDepth |%2d| is |%6d| nodes\n", X, NodeNumberCounterInit[X]);
}
printf("\nThe Total Number Of Nodes In The Trie = |%d| \n", TotalNodeSum);
// We will have exactly enough space for all of the Tnode pointers.
printf("\nStep 5 - Allocate a 2 dimensional array of Tnode pointers to minimize the trie into a Dawg.\n");
HolderOfAllTnodePointers = (TnodePtr **)malloc(MaxStringLength*sizeof(TnodePtr*));
for ( X = 0; X < MAX; X++ ) HolderOfAllTnodePointers[X] = (TnodePtr *)malloc(NodeNumberCounterInit[X]*sizeof(TnodePtr));
// When populating the "HolderOfAllTnodePointers", we are going to do so in a breadth first manner to ensure that the next "Tnode" in a list located at the next array index.
printf("\nStep 6 - Populate the 2 dimensional array of Trie node pointers.\n");
Populator = BreadthQueueInit();
BreadthQueuePopulateReductionArray(Populator, TrieRootTnode(TemporaryTrie), HolderOfAllTnodePointers );
printf("\nStep 7 - Population complete.\n");
// Flag all of the reduntant nodes.
// Flagging requires the node comparison function that will take a very long time for a big dictionary.
// This is especially true when comparing the nodes with small "MaxChildDepth"'s because there are so many of them.
// It is faster to start with nodes of the largest "MaxChildDepth" to recursively reduce as many lower nodes as possible.
printf("\nStep 8 - Begin to tag redundant nodes as dangled - Use recursion because only direct children are considered for elimination to keep the remaining lists in tact.\n");
// Start at the largest "MaxChildDepth" and work down from there for recursive reduction to take place early on to reduce the work load for the shallow nodes.
for ( Y = (MaxStringLength - 1); Y >= 0 ; Y--) {
NumberDangled = 0;
// Move through the holder array from the beginning, looking for any nodes that have not been dangled, these will be the surviving nodes.
for ( W = 0; W < (NodeNumberCounterInit[Y] - 1); W++ ) {
// The Node is already Dangling. Note that this node need not be the first in a child list, it could have been dangled recursively.
// In order to eliminate the need for the "Next" index, the nodes at the root of elimination must be the first in a list, in other words, a "DirectChild".
// The node that we replace the "DirectChild" node with can be located at any position.
if ( TNODE_DANGLING(HolderOfAllTnodePointers[Y][W]) == TRUE ) continue;
// Traverse the rest of the array looking for equivalent nodes that are both not dangled and are tagged as direct children.
// When we have found an identical list structure further on in the array, dangle it, and all the nodes coming after, and below it.
for ( X = W + 1; X < NodeNumberCounterInit[Y]; X++ ) {
if ( TNODE_DANGLING(HolderOfAllTnodePointers[Y][X]) == FALSE && TNODE_DIRECT_CHILD(HolderOfAllTnodePointers[Y][X]) == TRUE ) {
if ( TnodeAreWeTheSame(HolderOfAllTnodePointers[Y][W], HolderOfAllTnodePointers[Y][X]) == TRUE ) {
NumberDangled += TnodeDangle(HolderOfAllTnodePointers[Y][X]);
}
}
}
}
printf("Dangled |%5d| Nodes In all, through recursion, for MaxChildDepth of |%2d|\n", NumberDangled, Y);
DangleCount[Y] = NumberDangled;
TotalDangled += NumberDangled;
}
printf("\nTotal Number Of Dangled Nodes |%d|\n", TotalDangled);
NumberOfLivingNodes = TotalNodeSum - TotalDangled;
printf("\nTotal Number Of Living Nodes |%d|\n", NumberOfLivingNodes);
printf("\nStep 9 - Count the number of living nodes in the trie before replacement so that we check our numbers.\n");
// By running the graph tabulation function on a different array, and before we replace the nodes, we can determine if our numbers are correctish.
TrieGraphTabulate(TemporaryTrie, NodeNumberCounter);
for ( X = 0; X < MaxStringLength; X++ ) {
printf( "Count for living nodes of MaxChildDepth |%2d| is |%5d|. It used to be |%6d| and so the number dangled is |%6d| \n", X, NodeNumberCounter[X], NodeNumberCounterInit[X], NodeNumberCounterInit[X] - NodeNumberCounter[X] );
}
for ( X = 0; X < MAX; X++ ) {
TotalDangledCheck += (NodeNumberCounterInit[X] - NodeNumberCounter[X]);
}
if ( TotalDangled == TotalDangledCheck ) printf("The total number of nodes dangled adds up.\n");
else printf("Something went terribly wrong, so fix it.\n");
printf("\nStep 10 - Dangling is complete, so replace all dangled nodes with their first mexican equivelant in the Trie to make a compressed Dawg.\n");
// Node replacement has to take place before indices are set up so nothing points to redundant nodes. - This step is absolutely critical. Mow The Lawn so to speak! Then assign indicies.
TrieLawnMower( TemporaryTrie, HolderOfAllTnodePointers );
printf("\nStep 11.1 - Mowing of the lawn is now complete, so start to assign array indices to all living nodes.\n");
printf("Step 11.2 - The use of a breadth first queue during this step ensures that lists of contiguous nodes in the array will eliminate the need for a Next pointer.\n\n");
OrderMatters = BreadthQueueInit();
// Try to find out if the nodes we are setting are unique before we set them.
IndexCount = BreadthQueueUseToIndex( OrderMatters, HolderOfAllTnodePointers[MAX - 1][0] );
printf("Finished indexing.\n");
printf("NumberOfLivingNodes from after the dangling process|%d|\n", NumberOfLivingNodes);
printf("IndexCount from the index-handing-out breadth first traversal |%d|\n", IndexCount);
if ( NumberOfLivingNodes == IndexCount ) printf("The numbers add up properly once again.\n");
else {
printf("The Numbers got Scrooged, so you still have some problems to iron out.\n");
return NULL;
}
// Allocate the space needed to store the "Dawg" inside of an array.
Result = (unsigned int*)calloc((NumberOfLivingNodes + 1), sizeof(unsigned int));
// Roll through the pointer arrays and use the correct "BIT_SHIFT" values to encode the proper unsigned ints.
// Set the "NULL" entry at the beginning of the array.
Result[0] = 0;
printf("\nStep 12 - Populate the unsigned integer array with a bitwise encoding.\n");
// Traverse the entire 2D pointer array and search for undangled nodes. When an undangled node is found, encode it, and place it at position "ArrayIndex".
for ( X = (MaxStringLength - 1); X >= 0; X-- ) {
for (W = 0; W < NodeNumberCounterInit[X]; W++ ){
if ( TNODE_DANGLING(HolderOfAllTnodePointers[X][W]) == FALSE ) {
UnsignedEncodingValue = TNODE_LETTER(HolderOfAllTnodePointers[X][W]) - 'A';
UnsignedEncodingValue <<= LETTER_BIT_SHIFT;
UnsignedEncodingValue += (TNODE_END_OF_WORD_FLAG(HolderOfAllTnodePointers[X][W]) == FALSE)? 0: END_OF_WORD_BIT_MASK;
UnsignedEncodingValue += (TNODE_NEXT(HolderOfAllTnodePointers[X][W]) == NULL)? END_OF_LIST_BIT_MASK: 0;
UnsignedEncodingValue += (TNODE_CHILD(HolderOfAllTnodePointers[X][W]) == NULL)? 0: TNODE_ARRAY_INDEX(TNODE_CHILD(HolderOfAllTnodePointers[X][W]));
IndexFollow = TNODE_ARRAY_INDEX(HolderOfAllTnodePointers[X][W]);
if ( IndexFollow > IndexFollower ) IndexFollower = IndexFollow;
Result[IndexFollow] = UnsignedEncodingValue;
}
}
}
printf( "IndexFollower, which is the largest index assigned in the array = |%d|\n", IndexFollower );
printf( "NumberOfLivingNodes|%d|, assert that these two are equal because they must be.\n", NumberOfLivingNodes );
if ( IndexFollower == NumberOfLivingNodes ) printf("The numbers add up again, excellent!\n");
else {
printf("Don't jump! You are very close to getting this program working.\n");
return NULL;
}
// Do Garbage cleanup and free the whole Trie, which is no longer needed. Free all nodes from the holding array.
for ( X = 0; X < MaxStringLength; X++ ) for ( W = 0; W < NodeNumberCounterInit[X]; W++ ) free(HolderOfAllTnodePointers[X][W]);
free(TemporaryTrie);
free(NodeNumberCounter);
free(NodeNumberCounterInit);
for ( X = 0; X < MaxStringLength; X++ ) if (HolderOfAllTnodePointers[X]!= NULL) free(HolderOfAllTnodePointers[X]);
free(HolderOfAllTnodePointers);
printf("\nStep 13 - Creation of traditional Dawg is complete, so store it into a text file for verification and a 32-bit binary file for use.\n");
// We now need to include the "NULL" node at position "0" in the living node collection.
NumberOfLivingNodes += 1;
Text = fopen(DAWG_TEXT_DATA,"w");
fprintf(Text, "Total number of Dawg nodes = |%d|, including \"0 = NULL\".\n\n", NumberOfLivingNodes);
for ( X = 1; X < NumberOfLivingNodes; X++ ) {
ConvertUnsignedIntNodeToBinaryString(Result[X], TheNodeInBinary);
fprintf(Text, "Node|%6d|-Letter|%c|-EOW|%3s|-EOL|%3s|-Child|%6d| - Binary%s\n", X, ((Result[X]&LETTER_BIT_MASK)>>LETTER_BIT_SHIFT) + 'A', (Result[X]&END_OF_WORD_BIT_MASK)? "yes": "no", (Result[X]&END_OF_LIST_BIT_MASK)? "yes": "no", Result[X]&CHILD_INDEX_BIT_MASK, TheNodeInBinary);
}
//printf("Beep\n");
fclose(Text);
printf("Out of 32 bit traditional text output to file clean.\n");
Data = fopen(DAWG_DATA,"wb");
// It is critical, especially in a binary file, that the first integer written to the file be the number of nodes stored in the file.
// Simply write the entire unsigned integer array "Result" into the data file.
fwrite(&NumberOfLivingNodes, sizeof(int), 1, Data);
fwrite(Result, sizeof(unsigned int), NumberOfLivingNodes, Data);
fclose(Data);
printf("Out of 32 bit traditional data output to file clean.\n");
return Result;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The "Main()" function just runs the show.
int main() {
int X = 0;
// All of the words of similar length will be stored sequentially in the same array so that there will be (MAX + 1) arrays in total. The Smallest length of a string is assumed to be 2.
char *AllWordsInEnglish[MAX + 1];
FILE *Input;
char ThisLine[INPUT_LIMIT];
unsigned int FirstLineIsSize;
int LineLength;
int KeepTracker[MAX + 1];
int CurrentTracker[MAX + 1];
int DictionarySizeIndex[MAX + 1];
int CurrentLength = 0;
unsigned int *TheLexiconDawg;
unsigned int Testing;
char BinaryStringTester[BINARY_STRING_LENGTH];
printf("The main function lives\n");
for ( X = 0; X <= MAX; X++ ) AllWordsInEnglish[X] = NULL;
Input = fopen(RAW_LEXICON,"r");
fgets(ThisLine, INPUT_LIMIT, Input);
// Correction is needed to get rid of the new line character that "fgets()" appends to the string.
CUT_OFF_NEW_LINE(ThisLine);
LineLength = strlen(ThisLine);
printf("ThisLine is |%s|\n", ThisLine);
FirstLineIsSize = StringToUnsignedInt(ThisLine);
printf("FirstLineIsSize is |%u|\n", FirstLineIsSize);
// Read the memory straight into ram using dynamically allocated space to count the words of each length.
for ( X = 0; X <= MAX; X++ ) KeepTracker[X] = 0;
char **LexiconInRam = (char**)malloc(FirstLineIsSize*sizeof(char*));
// The first line gives us the number of words so read in all of them into ram temporarily.
for ( X = 0; X < FirstLineIsSize; X++ ) {
fgets(ThisLine, INPUT_LIMIT, Input);
CUT_OFF_NEW_LINE(ThisLine);
MakeMeAllCapital(ThisLine);
LineLength = strlen( ThisLine );
if ( LineLength <= MAX ) KeepTracker[LineLength] += 1;
LexiconInRam[X] = (char*)malloc((LineLength + 1)*sizeof(char));
strcpy(LexiconInRam[X], ThisLine);
}
printf("Lexicon.txt read is now complete\n\n");
for ( X = 0; X <= MAX; X++ ) printf("There are |%5d| words of length |%2d|\n", KeepTracker[X], X);
// Allocate enough space to hold all of the words in strings so that we can add them to the trie by length.
for ( X = MIN; X <= MAX; X++ ) AllWordsInEnglish[X] = (char*)malloc((X + 1)*KeepTracker[X]*sizeof(char));
printf("\nInitial malloc() complete\n");
for ( X = 0; X <= MAX; X++ ) CurrentTracker[X] = 0;
// Copy all of the strings into "AllWordsInEnglish".
for ( X = 0; X < FirstLineIsSize; X++ ) {
CurrentLength = strlen(LexiconInRam[X]);
// As convoluted as this command may appear, it simply copies a string from its temporary ram location to the array of length equivelant strings for adding to the intermediate "Trie".
if ( CurrentLength <= MAX ) strcpy(&((AllWordsInEnglish[CurrentLength])[(CurrentTracker[CurrentLength]*(CurrentLength + 1))]), LexiconInRam[X]);
CurrentTracker[CurrentLength] += 1;
}
printf("The words have now been sorted by length\n");
// Make sure that the counting has resulted in all of the strings being placed correctly.
for ( X = 0; X < (MAX + 1); X++ ) {
if ( KeepTracker[X] != CurrentTracker[X] ) {
printf("The number of words is not adding up properly, so fix it.\n");
return 0;
}
}
// Free the initial ram read space
for ( X = 0; X < FirstLineIsSize; X++ ) free(LexiconInRam[X]);
free(LexiconInRam);
for ( X = 0; X <= MAX; X++ ) DictionarySizeIndex[X] = KeepTracker[X];
printf( "The Dawg init function will now be run, so be patient, it will take some time to complete.\n" );
TheLexiconDawg = DawgInit(AllWordsInEnglish, DictionarySizeIndex, MAX, MIN);
// Free up the array that holds the uncompressed English language.
for ( X = 0; X <= MAX; X++ ) if ( AllWordsInEnglish[X] != NULL ) free(AllWordsInEnglish[X]);
return 0;
}