forked from benbalter/Frequency-Analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrequency-analysis.php
154 lines (137 loc) · 4.03 KB
/
frequency-analysis.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
<?php
//Title of Analysis (printed in the page <title> and in the headings)
$doctitle = '';
//What analysis are we doing
// (int # of words to analyze per phrase) => (string label for that phrase)
$analysis = array(
    1=>"Single-Word Analysis",
    2=>"Two-Word Phrase Analysis",
    3=>"Three-Word Phrase Analysis",
    4=>"Four-Word Phrase Analysis",
    5=>"Five-Word Phrase Analysis"
);
//How many results per analysis
$limit = 50;
//grab file contents; testing readability first avoids the PHP warning that
//file_get_contents() raises for a missing file, so only our own message is shown
$content = is_readable( 'input.txt' ) ? file_get_contents( 'input.txt' ) : false;
//if the file doesn't exist (or holds nothing but whitespace), error out
if ( $content === false || trim( $content ) === '' )
    die( 'Please place your source text in "input.txt" in the same directory as this file' );
//strip out bad characters, just the words, ma'am
$content = preg_replace( "/(,|\"|\.|\?|:|!|;| - )/", " ", $content );
//split content on words. split() was removed in PHP 7; preg_split() on any run
//of whitespace also covers newlines and tabs, and PREG_SPLIT_NO_EMPTY drops the
//empty tokens that leading/trailing blanks would otherwise produce
$content = preg_split( '/\s+/', $content, -1, PREG_SPLIT_NO_EMPTY );
//not read anywhere else in this file; kept so the script's globals stay the same
$words = array();
/**
 * Parses a list of words and builds an array of phrase statistics.
 *
 * Every run of $num consecutive words is lower-cased, joined with single
 * spaces and tallied. Empty words are ignored, and for single-word analysis
 * a fixed list of common English words is removed from the result.
 *
 * @param array $input source text, already split into words
 * @param int $num number of words in phrase to look for
 * @return array phrase => number of occurrences, most frequent first
 */
function build_stats($input,$num) {
    $num = (int) $num;
    //init array
    $results = array();

    //normalise once up front: lower-case every word and drop empty ones so
    //they cannot end up inside a phrase
    $words = array();
    foreach ( (array) $input as $word ) {
        $word = strtolower( (string) $word );
        if ( $word !== '' )
            $words[] = $word;
    }

    //index of the last position where a full $num-word phrase still fits
    $last = count( $words ) - $num;
    if ( $num < 1 || $last < 0 )
        return $results;

    //look for every n-word pattern and tally counts in array; stopping at
    //$last keeps the window from running past the end of the text, which
    //would count truncated phrases
    for ( $start = 0; $start <= $last; $start++ ) {
        $phrase = implode( ' ', array_slice( $words, $start, $num ) );
        if ( !isset( $results[$phrase] ) )
            $results[$phrase] = 1;
        else
            $results[$phrase]++;
    }

    if ( $num === 1 ) {
        //clean boring words (explode() replaces split(), removed in PHP 7)
        $boring = explode( " ", "the of and to a in that it is was i for on you he be with as by at have are this not but had his they from she which or we an there her were one do been all their has would will what if can when so my" );
        foreach ( $boring as $banned )
            unset( $results[$banned] );
    }

    //sort and return; arsort() keeps keys, whereas array_multisort() re-indexes
    //numeric keys and so loses phrases such as "2010"
    arsort( $results );
    return $results;
}
/**
 * Formats output: prints one analysis as an HTML table of rank, term and frequency.
 *
 * Terms whose count is 1 are skipped, and at most $limit rows (read from the
 * global of that name) are printed. Rows appear in the order of $stats, so the
 * array is expected to be sorted already.
 *
 * @param array $stats results from build_stats
 * @param string $name name of this test group; also used, lower-cased and with
 *                     spaces turned into dashes, as the id of the heading
 *
 */
function print_stats($stats,$name) {
    global $limit;
    //terms come straight from the analysed text, so everything placed into the
    //page is escaped; ENT_SUBSTITUTE stops invalid UTF-8 from blanking a value
    $flags = ENT_QUOTES | ENT_SUBSTITUTE;
    $heading = htmlspecialchars( (string) $name, $flags, 'UTF-8' );
    $anchor = htmlspecialchars( strtolower( str_replace( ' ', '-', (string) $name ) ), $flags, 'UTF-8' );
?>
<div class='analysis'>
<h2 id='<?php echo $anchor; ?>'><?php echo $heading; ?></h2>
<table border='1'>
<tr>
<th>Rank</th>
<th>Term(s)</th>
<th>Frequency</th>
</tr>
<?php
    $i=1;
    foreach ($stats as $term => $count) {
        //a phrase seen only once is not worth a row
        if ($count == 1) continue;
        if ($i > $limit) break;
        //numeric terms arrive as integer keys, hence the string cast
        $term = htmlspecialchars( (string) $term, $flags, 'UTF-8' );
        $count = (int) $count;
        echo "
<tr>
<td>$i</td>
<td>$term</td>
<td>$count</td>
</tr>";
        $i++;
    }
?>
</table>
</div>
<?php } ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title><?php echo htmlspecialchars( $doctitle, ENT_QUOTES, 'UTF-8' ); ?> Frequency Analysis</title>
<style type="text/css">
.toc {top:0px;right:0px;position:fixed; padding: 10px 20px 10px 20px; height:100%; border-left:1px solid black; background: #ddd; width:20%;}
.analysis {float:left; width:40%; text-align: center; padding:10px;}
.analysis table {margin-left:auto; margin-right:auto}
.container {width: 80%}
h1,h2,h3 {text-align:center}
</style>
</head>
<body>
<div class='toc'>
<h3><?php echo htmlspecialchars( $doctitle, ENT_QUOTES, 'UTF-8' ); ?> Frequency Analysis</h3>
<ul>
<?php
//table of contents: one link per analysis; the anchor is built the same way
//print_stats() builds the id of its heading (lower-case, spaces to dashes)
foreach ($analysis as $id=>$title) {
    $anchor = htmlspecialchars( strtolower( str_replace( ' ', '-', $title ) ), ENT_QUOTES, 'UTF-8' );
    $label = htmlspecialchars( $title, ENT_QUOTES, 'UTF-8' );
?>
<li><a href='#<?php echo $anchor; ?>'><?php echo $label; ?></a></li>
<?php }
?>
<li><a href='#overall'>Overall</a></li>
</ul>
</div>
<div class='container'>
<h1><?php echo htmlspecialchars( $doctitle, ENT_QUOTES, 'UTF-8' ); ?> Frequency Analysis</h1>
<?php
//init array
$overall = array();
//loop through each analysis group and run our test
foreach ($analysis as $id=>$title) {
    $stats = build_stats($content,$id);
    //array_replace() instead of array_merge(): both let a later analysis
    //overwrite an identical phrase, but array_merge() renumbers integer keys,
    //which would strip purely numeric terms of their text
    $overall = array_replace($overall,$stats);
    print_stats($stats,$title);
}
//sort and print overall stats; arsort() keeps each term attached to its count
arsort($overall);
print_stats($overall,"Overall");
?>
</div>
</body>
</html>