-
Notifications
You must be signed in to change notification settings - Fork 0
/
Titlecase.pm
180 lines (159 loc) · 5.54 KB
/
Titlecase.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/perl
use 5.10.1;
#use warnings;
use strict;
use utf8;
binmode STDOUT, ":encoding(UTF-8)";
use warnings FATAL => qw(uninitialized);
# use Data::Dumper::Simple;
use Tie::File::AsHash;
package Titlecase;
use Exporter;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
our $VERSION = 1;
our @ISA = qw(Exporter);
our @EXPORT = ();
our @EXPORT_OK = qw(titlecase isanacronym ucfirstimproved possibleacronym);
#our %EXPORT_TAGS = ( DEFAULT => [qw(&titlecase &isanacronym &ucfirstimproved &possibleacronym) ] );
#lower cases all and then ignoring starting punctuation, uppercase the first word of the string of which each word has been filtered for words that should not be titlecased and then titlecasesd using ucfirstimproved (which ignores initial punctuation in order to correctly punctuate quoted strings). Removes multiple spaces too for good measure.
#English only!
my $acronym_page="User:HastyBot/Acronyms";
use HastyBot qw(getpage);
my $text=getpage($acronym_page); # HastyBot performs log in for us...
my ($one, $two);
die "No Acronyms found - error on page [[$acronym_page]]." if $text eq "";
my %acr;
foreach (split /\n/, $text) {
next if $_ !~ /^\*/;
s/\*//; # sanitise to alphanum and bar.
s/[^a-z0-9\|]//i;
($one, $two) = split /\|/;
$one=lc($one);
warn "Acronymn $one already defined!" if defined $acr{$one};
$acr{$one}=$two;
};
sub titlecase {
my ($s) = @_;
$s =~ s/<br>//ig; # corner case - cant see why it should even be there...
return $s if $s=~ m/.*\[+.*\]+.*/; #if one or more square brackets then give up
return "" if $s eq ""; # sanity for null strings
$s =~ s/\s\&(amp;)?\s/ and /g;# get rid of horrid ampersands...
my $ts = substr($s,-1)eq " " ? " " : ""; # if trailing space then keep it (hack)
#say "$ts|ping";
$s = join " ", map { _titlecasemangler($_) } split /\s+/, $s;
$s = ($s=~ m/^[^a-z]*(\w+)/i && isacronym($1)) ? $s.$ts : ucfirstimproved( $s.$ts ) ;
# two word alternations
$s =~ s/(Appendix )([a-z]|[IVXivx]+)/$1.uc($2)/ge; # get Appendix A or I, II, VI etc
$s =~ s/([IVXivx]+)([\.:]\s)/uc($1).$2/ge; # get Appendix A or I, II, VI etc
# corner cases...
$s =~ s/wise school/WISE School/ig; # corner case
$s =~ s/\.\s+it/. It/g; # for some reason there are sentences in headings... this is a corner case
$s =~ s/\.\s+the/. The/g; # for some reason there are sentences in headings... this is a corner case
$s =~ s/Call2All/Call2All/ig; # corner case
$s =~ s/go manual/GO Manual/ig; # corner case
$s =~ s/\ / /ig; # corner case
$s =~ s/a\/v/A\/V/ig; # corner case
return $s;
};
sub _titlecasemangler {
my ($s) = @_;
# if a compound word joined with / or - then split mangle on both bits...
return _titlecasemangler2($s) if $s !~ m/\b[-\/]\b/;
return join '/', map {_titlecasemangler2($_)} split '/', $s if $s =~ m/\//;
return join '-', map {_titlecasemangler2($_)} split '-', $s if $s =~ m/-/;
return $s;
#$s =~ m/(\w+)([\/-])(.*)/;
#return _titlecasemangler2($1).$2._titlecasemangler2($3);
}
sub _titlecasemangler2 {
my ($s) = @_;
return lc($s) if $s=~ m/^\d+(st|nd|rd|th)[^a-z]*$/i ; # ignore 1st 2nd, 3rd, 4th etc.
return lc($s) if $s=~ m/^[^a-z]*(a(nd?|s|t|m)*|b(ut|y)|do|en|for|i[fnst]|o[fnr]|t[he|o]*|vs?\.?|via|etc|e\.g)[\,\."':;]*$/i; # return lowercased ignore words
# handle acronyms well - via a hash
my $acro=isacronym($s);
return $acro if $acro;# if there is an acronym correct and return
return ucfirstimproved(lc($s)); #so now we title case it!
};
sub isacronym {
# my %acr = (
# http => 'http',
# https => 'https',
# ftp => 'ftp',
# mailto => 'mailto',
# html => 'HTML',
# rss => 'RSS',
# css => 'CSS',
# cms => 'CMS',
# php => 'PHP',
# pr => 'PR',
# welc => 'WELC',
# welt => 'WELT',
# ywam => 'YWAM',
# ywamer => 'YWAMer',
# ywamers => 'YWAMers',
# dts => 'DTS',
# cdts => 'CDTS',
# bls => 'BLS',
# soe => 'SOE',
# sofm => 'SOFM',
# spld => 'SPLD',
# sbcw => 'SBCW',
# uofn => 'UofN',
# ywamkb => 'YWAMKB',
# kb => 'KB',
# isbn => 'ISBN',
# pdf => 'PDF',
# su => 'su',
# sudo => 'sudo',
# pdfs => 'PDFs',
# uk => 'UK',
# hiv => 'HIV',
# aids => 'AIDS',
# dfh => 'DFH',
# df1 => 'DF1',
# awol => 'AWOL',
# glt => 'GLT',
# nlt => 'NLT',
# blt => 'BLT',
# elf => 'ELF',
# xml => 'XML',
# crit => 'CRIT',
# dna => 'DNA',
# agm => 'AGM',
# swot => 'SWOT',
# faq => 'FAQ',
# kbian => 'KBian',
# xxx => 'XXX',
# xp => 'XP',
# diy => 'DIY',
# dvd => 'DVD',
# # call2all => 'Call2All',
# url => 'URL',
# mgm => 'MGM',
# itunes => 'iTunes',
# iphone => 'iPhone',
# ipod => 'iPod',
# knowledgebase => 'KnowledgeBase',
# ywamknowledgebase => 'YWAMKnowledgeBase',
# );
my ($key) = @_;
$key=~ m/([^a-z]*)([a-z]*)(.*)/i; #place before, middle and after into search variables
$key=lc($2);
if (!defined $acr{$key} && $2 eq uc($key) && length($2)>2 ) {possibleacronym($2)};
return 0 if !defined $acr{$key}; #if there is no match then return
return $1.$acr{$key}.lc($3) ; #send back reassembled acronym
};
{ my %possacro; #declare static
sub possibleacronym {
foreach my $key (@_) {
$possacro{$key}="seen";
}
return %possacro;
};
};
sub ucfirstimproved {
my ($s)= @_;
$s=~ s/([^a-z]*)([a-z])(.*)/$1.uc($2).$3/gei; #(globally match, eval and insensitive search)
return $s;
};
1;