-
Notifications
You must be signed in to change notification settings - Fork 1
/
crowdsum.sql
1276 lines (1236 loc) · 187 KB
/
crowdsum.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
-- phpMyAdmin SQL Dump
-- version 4.1.7
-- http://www.phpmyadmin.net
--
-- Host: localhost
-- Generation Time: Apr 01, 2014 at 10:16 PM
-- Server version: 5.6.16-log
-- PHP Version: 5.4.25
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET time_zone = "+00:00";
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
--
-- Database: `crowdsum`
--
-- --------------------------------------------------------
--
-- Table structure for table `documents`
--
CREATE TABLE IF NOT EXISTS `documents` (
`author` text,
`title` text,
`keywords` text,
`contributions` int(11) DEFAULT NULL,
`created` datetime DEFAULT NULL,
`fulltext` longtext,
`id` int(11) NOT NULL AUTO_INCREMENT,
`modified` datetime DEFAULT NULL,
`publication` int(11) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=84 ;
--
-- Dumping data for table `documents`
--
INSERT INTO `documents` (`author`, `title`, `keywords`, `contributions`, `created`, `fulltext`, `id`, `modified`, `publication`) VALUES
('Wired', 'Smartwatch', NULL, 1, '2014-03-31 11:04:32', 'Last week, Google unveiled its vision of the smartwatch, the elusive Next Big Gadget. It takes the form of Android Wear, a new version of the mobile operating system designed specifically for on-the-body devices. It’s a good deal more sophisticated than the smartwatches we’ve seen hitherto, relying on the company’s unparalleled voice recognition for registering simple commands and promising to serve up the “info and suggestions you need, right when you need them” thanks to the same predictive, personalized algorithms that power Google Now on Android phones.\r\n\r\nAmidst speculation that Apple’s long-fabled iWatch might in fact be a health-specific wristband, Android Wear is clearly aiming for something much bigger. And that makes sense. If there’s any company today that has a chance to make the multipurpose smartwatch we’ve all been dreaming of, it’s Google. But it’s not just heaps of data and algorithmic might that make Android Wear promising. It’s also Google’s approach to the endeavor–its willingness to let third-party developers deeper into the stack and, potentially, to let users define the experience for themselves–that could help make it a hit.\r\n\r\nContext Is King\r\n\r\nContext is the holy grail of wearable devices. With the limited real estate of a watch face, knowing what app, service, prompt or data point a person needs at a specific moment becomes paramount. The shiny promotional videos Google released this week show how context plays out in Android Wear in a number of situations. On the bus, your smartwatch might show you the next few stops; if there’s a meeting coming up, it’ll remind you who it’s with, and offer directions for how to get there. The video suggests a few less obvious use cases, too. If your Android Wear watch feels itself shaking around and its microphone hears music, it might figure out that you’re dancing, and tell you what song’s playing.\r\n\r\nBut context isn’t just about using sensors to intuit your environment and activity. It’s also about tying your scattered digital existence to your actual, physical self. It’s about looking at your calendar, your inbox, and your contacts in concert, cross-referencing them, and coming away with a more human understanding of your schedule, your to-do list, and your circle of friends. When it was released in 2004, Gmail did away with the hassle of organizing email by letting you search through your inbox. At its best, a contextually-savvy operating system like Android Wear takes the next step, doing away the hassle of search by surfacing the stuff you need automatically when you need it.\r\n\r\nIt’s this second, more intimate type of context that Google is so uniquely poised to conquer, according to Nick de la Mare, principal of design consultancy Big Tomorrow. De la Mare, who worked extensively on wearable projects as Executive Creator Director at Frog, sees Android Wear signaling a move to contextually-driven simplicity over the “maximalist,” computer-on-your-wrist approach of watches like the Galaxy Gear.\r\n\r\n“There are very few companies that have that repository of data to provide that simplicity,” de la Mare says. “Google is one of the only organizations that can take the management away from you and provide something meaningful.”\r\n\r\nRevisiting Our Assumptions About Apps\r\n\r\nImage: Google\r\n\r\nImage: Google\r\n\r\nContextual awareness is the key to a functionally robust smartwatch. What will make one truly useful, however, is how easy it is to use. The metric for success is simple: for a smartwatch to make sense, it has to let you do things more quickly than you could by pulling your smartphone out of your pocket.\r\n\r\nThis is where a lightweight user interface is key, and it seems like Google’s got a promising foundation, mixing concise, swipe-able cards with optional voice commands. From one perspective, it’s the logical continuation of the card-based UI that took root with Google Now. From a different viewpoint, however, it’s something considerably more radical: a reinvention of mobile apps as we know them.\r\n\r\nThe Android Wear UI is based on two core functions: “suggest” and “demand.” Suggest is the term Google uses for all the notification cards that make up the watch’s “context stream.” These could include urgent notifications, like text messages, that buzz your wrist when they come in, or morsels of data that get silently added to your stack, like scores of sports games.\r\n\r\nBut these aren’t “notifications” in the smartphone sense–hollering flags that pull you back into a third-party app. On the watch, they serve as the apps themselves. Google lays out strict guidelines for how these should work: “Omit needless text from your notifications. Design for glance-ability, not reading. Use words and phrases, not sentences. Show, don’t tell: Where possible use simple icons, glyphs, and visualizations to convey your message.”\r\nA smartwatch has to let you do things more quickly than you could by pulling your smartphone out.\r\n\r\nNotifications can be supplemented with additional “pages,” which people can access by swiping sideways on their smartwatch screen. These can add additional information or actions users can take on the data. The example Google gives is a reminder for a picnic. The notification itself reminds you that you have a picnic scheduled with a friend; the next page tells you that you’re responsible for bringing bread, cheese, and wine; and the third gives you a button for navigating to the spot.\r\n\r\nIt’s worth reiterating: This is Google’s idea of a smartwatch app. Timely notifications and relevant actions, all bundled up in a relatively strict visual language. Apps, in this vision, become much more homogenized; they’re about utility, service, information and action more than anything else. In this new model, you don’t tap icons to summon apps. Instead, they just pop up when you need them, triggered by contextual cues like location, time, or activity.\r\n\r\nThe other part of the Android Wear interface is “demand,” encompassing something Google refers to as the “Cue Card.” This is a list of commands that can be spoken or tapped on screen. From the look of things, it seems like these will include a preset list of actions for calling cabs, taking notes, sending messages, setting alarms and the like. These can either be triggered by tapping the screen, or by saying the command aloud. In Android Wear, apps aren’t to be thought of as discrete programs but rather as actions you can take.\r\n\r\nHere’s an important bit: Google’s developer documents state that users will be able to choose which app corresponds to these demands. This is where Google’s willingness to let users choose could be a huge boon to their smartwatch efforts. Presumably you could pick whether saying “call me a cab” triggers Uber, say, or Lyft. ', 10, '2014-03-31 11:04:54', 2013),
('Wikipedia', 'Cobra', NULL, 2, '2014-04-01 10:17:56', 'Cobra ( pronunciation (help·info)) is the Portuguese word for `snake`. In English and some other languages it has been adopted as the name for any of various species of venomous snakes. Most of those species are in the family Elapidae, all of which are venomous. Most of them can spread their neck ribs to form a flattened, widened hood.\r\nNot all snakes commonly referred to as cobras are of the same genus, or even in the family Elapidae. The name `cobra` is short for cobra de capelo or cobra-de-capelo, which is Portuguese for `snake with hood`, or `hood-snake`.[1] In some modern languages, such as Afrikaans, the other part of the Portuguese name was adopted, and the predominant name for a cobra in Afrikaans is `kapel`.[2][3] When disturbed, most of these snakes rear up and spread their necks (or hoods) in a characteristic threat display, making them a favorite of snake charmers because of the dramatic effect. Long ago, snake charming used to be a religious ritual, though nowadays it has become an entertainment. Cobras, which may live up to 20 years, are found from southern Africa, through southern Asia, to some of the islands of Southeast Asia.\r\nCobra may refer to:\r\nNaja, also known as typical or `true` cobras (known for raising the front part of the body and flattening the neck in a warning signal when alarmed), a group of elapids found in Africa and Asia. They include over 20 species, including Naja nivea, the cape, a moderately sized, highly venomous cobra inhabiting a wide variety of biomes across southern Africa, Cleopatra''s `asp` (the Egyptian cobra, Naja haje) and the Asiatic spectacled cobra Naja naja and monocled cobra, Naja kaouthia.\r\nSpitting cobras, a subset of Naja species with the ability to squirt venom from their fangs in self-defense\r\nHemachatus haemachatus, ringhals, rinkhals or ring-necked spitting cobra, a species of the Elapidae found in Africa\r\nAny member of the genus Boulengerina, the water cobras, a group of Elapidae found in Africa (now regarded as species in the genus Naja)\r\nParanaja multifasciata, the burrowing cobra, an African species of the Elapidae (now regarded as a species of Naja)\r\nAny member of the genus Aspidelaps, the shield cobras,[4] an African genus in the Elapidae\r\nAny species of Pseudohaje, the tree cobras, a genus of African Elapidae\r\nOphiophagus hannah, the king cobra, an elapid found in parts of India and southern Asia\r\nMicrurus fulvius, the American cobra or eastern coral snake, a species of the Elapidae found in the southeastern United States\r\nHydrodynastes gigas, the false water cobra, a mildly venomous member of the family Colubridae. It is indigenous to parts of South America and forms a hood if disturbed, though the hood is longer and narrower than those of `true` cobras in the Elapidae.\r\nNot a common name, but a highly obsolete synonym for the genus Bitis, the adders, a group of venomous vipers found in Africa and parts of the Middle East.\r\nMost so-called, and all `true`, species of cobras belong to the family Elapidae. So do many other notoriously venomous snake species, including mambas, sea snakes, and coral snakes. The genus Naja contains over twenty species of cobras and is the most widespread and widely recognized genus of cobras, sometimes called the `true` cobras. Members of the genus range from Africa through the Middle East, India, and Southeast Asia to Indonesia.\r\nAlthough the king cobra, Ophiophagus hannah, the world’s longest venomous snake, is a member of the Elapidae and can raise a rather narrow hood if disturbed, it is not in the genus Naja and accordingly is not a true cobra.\r\nThe other cobra of Asia is known as Asian, Indian or Spectacled cobra due to the eyeglass-shaped pattern on its skin. The hood of the Asian cobra is larger than that of the king cobra and is usually yellow or brown with a black and white spectacle pattern on top and two black and white spots on the lower surface.\r\nThe Rinkhals, Hemachatus haemachatus also called a spitting cobra, is endemic to southern Africa. It also is not in the genus Naja\r\nAlthough the bites of some species are extremely dangerous, cobras of any kind have not been shown to attack people unprovoked, and practically never without a threat display, typically raising the hood and hissing.\r\nVarious species of cobras prey mainly on other snakes, birds and small mammals, while its main natural predators in turn are other snakes, birds of prey, and small predatory mammals such as mongooses.\r\nAlthough most cobras don''t make nests, some species protect their eggs until they hatch (incubation typically taking around 60 days).', 26, '2014-04-01 10:18:54', 2014),
('bouke', 'succesful', NULL, 3, '2014-04-01 21:01:08', 'Information retrieval is the activity of obtaining information resources relevant to an information need from a collection of information resources. Searches can be based on metadata or on full-text (or other content-based) indexing.\r\nAutomated information retrieval systems are used to reduce what has been called "information overload". Many universities and public libraries use IR systems to provide access to books, journals and other documents. Web search engines are the most visible IR applications.\r\nContents [hide] \r\n1 Overview\r\n2 History\r\n3 Model types\r\n3.1 First dimension: mathematical basis\r\n3.2 Second dimension: properties of the model\r\n4 Performance and correctness measures\r\n4.1 Precision\r\n4.2 Recall\r\n4.3 Fall-out\r\n4.4 F-measure\r\n4.5 Average precision\r\n4.6 R-Precision\r\n4.7 Mean average precision\r\n4.8 Discounted cumulative gain\r\n4.9 Other Measures\r\n4.10 Timeline\r\n5 Awards in the field\r\n6 See also\r\n7 References\r\n8 External links\r\nOverview[edit]\r\n\r\nAn information retrieval process begins when a user enters a query into the system. Queries are formal statements of information needs, for example search strings in web search engines. In information retrieval a query does not uniquely identify a single object in the collection. Instead, several objects may match the query, perhaps with different degrees of relevancy.\r\nAn object is an entity that is represented by information in a database. User queries are matched against the database information. Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos. Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.\r\nMost IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value. The top ranking objects are then shown to the user. The process may then be iterated if the user wishes to refine the query.[4]\r\nHistory[edit]\r\n\r\n But do you know that, although I have kept the diary [on a phonograph] for months past, it never once struck me how I was going to find any particular part of it in case I wanted to look it up? \r\nDr Seward, Bram Stoker's Dracula, 1897\r\nThe idea of using computers to search for relevant pieces of information was popularized in the article As We May Think by Vannevar Bush in 1945.[5] The first automated information retrieval systems were introduced in the 1950s and 1960s. By 1970 several different techniques had been shown to perform well on small text corpora such as the Cranfield collection (several thousand documents).[5] Large-scale retrieval systems, such as the Lockheed Dialog system, came into use early in the 1970s.\r\nIn 1992, the US Department of Defense along with the National Institute of Standards and Technology (NIST), cosponsored the Text Retrieval Conference (TREC) as part of the TIPSTER text program. The aim of this was to look into the information retrieval community by supplying the infrastructure that was needed for evaluation of text retrieval methodologies on a very large text collection. This catalyzed research on methods that scale to huge corpora. The introduction of web search engines has boosted the need for very large scale retrieval systems even further.\r\nModel types[edit]\r\n\r\n\r\n\r\nCategorization of IR-models (translated from German entry, original source Dominik Kuropka).\r\nFor effectively retrieving relevant documents by IR strategies, the documents are typically transformed into a suitable representation. Each retrieval strategy incorporate a specific model for its document representation purposes. The picture on the right illustrates the relationship of some common models. In the picture, the models are categorized according to two dimensions: the mathematical basis and the properties of the model.\r\nFirst dimension: mathematical basis[edit]\r\nSet-theoretic models represent documents as sets of words or phrases. Similarities are usually derived from set-theoretic operations on those sets. Common models are:\r\nStandard Boolean model\r\nExtended Boolean model\r\nFuzzy retrieval\r\nAlgebraic models represent documents and queries usually as vectors, matrices, or tuples. The similarity of the query vector and document vector is represented as a scalar value.\r\nVector space model\r\nGeneralized vector space model\r\n(Enhanced) Topic-based Vector Space Model\r\nExtended Boolean model\r\nLatent semantic indexing aka latent semantic analysis\r\nProbabilistic models treat the process of document retrieval as a probabilistic inference. Similarities are computed as probabilities that a document is relevant for a given query. Probabilistic theorems like the Bayes' theorem are often used in these models.\r\nBinary Independence Model\r\nProbabilistic relevance model on which is based the okapi (BM25) relevance function\r\nUncertain inference\r\nLanguage models\r\nDivergence-from-randomness model\r\nLatent Dirichlet allocation\r\nFeature-based retrieval models view documents as vectors of values of feature functions (or just features) and seek the best way to combine these features into a single relevance score, typically by learning to rank methods. Feature functions are arbitrary functions of document and query, and as such can easily incorporate almost any other retrieval model as just a yet another feature.\r\nSecond dimension: properties of the model[edit]\r\nModels without term-interdependencies treat different terms/words as independent. This fact is usually represented in vector space models by the orthogonality assumption of term vectors or in probabilistic models by an independency assumption for term variables.\r\nModels with immanent term interdependencies allow a representation of interdependencies between terms. However the degree of the interdependency between two terms is defined by the model itself. It is usually directly or indirectly derived (e.g. by dimensional reduction) from the co-occurrence of those terms in the whole set of documents.\r\nModels with transcendent term interdependencies allow a representation of interdependencies between terms, but they do not allege how the interdependency between two terms is defined. They relay an external source for the degree of interdependency between two terms. (For example a human or sophisticated algorithms.)\r\nPerformance and correctness measures[edit]\r\n\r\nMain article: Precision and recall\r\nMany different measures for evaluating the performance of information retrieval systems have been proposed. The measures require a collection of documents and a query. All common measures described here assume a ground truth notion of relevancy: every document is known to be either relevant or non-relevant to a particular query. In practice queries may be ill-posed and there may be different shades of relevancy.\r\nPrecision[edit]\r\nPrecision is the fraction of the documents retrieved that are relevant to the user's information need.\r\n \\mbox{precision}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{retrieved documents}\\}|} \r\nIn binary classification, precision is analogous to positive predictive value. Precision takes all retrieved documents into account. It can also be evaluated at a given cut-off rank, considering only the topmost results returned by the system. This measure is called precision at n or P@n.\r\nNote that the meaning and usage of "precision" in the field of Information Retrieval differs from the definition of accuracy and precision within other branches of science and statistics.\r\nRecall[edit]\r\nRecall is the fraction of the documents that are relevant to the query that are successfully retrieved.\r\n\\mbox{recall}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{relevant documents}\\}|} \r\nIn binary classification, recall is often called sensitivity. So it can be looked at as the probability that a relevant document is retrieved by the query.\r\nIt is trivial to achieve recall of 100% by returning all documents in response to any query. Therefore recall alone is not enough but one needs to measure the number of non-relevant documents also, for example by computing the precision.\r\nFall-out[edit]\r\nThe proportion of non-relevant documents that are retrieved, out of all non-relevant documents available:\r\n \\mbox{fall-out}=\\frac{|\\{\\mbox{non-relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{non-relevant documents}\\}|} \r\nIn binary classification, fall-out is closely related to specificity and is equal to (1-\\mbox{specificity}). It can be looked at as the probability that a non-relevant document is retrieved by the query.\r\nIt is trivial to achieve fall-out of 0% by returning zero documents in response to any query.\r\nF-measure[edit]\r\nMain article: F-score\r\nThe weighted harmonic mean of precision and recall, the traditional F-measure or balanced F-score is:\r\nF = \\frac{2 \\cdot \\mathrm{precision} \\cdot \\mathrm{recall}}{(\\mathrm{precision} + \\mathrm{recall})}.\\,\r\nThis is also known as the F_1 measure, because recall and precision are evenly weighted.\r\nThe general formula for non-negative real \\beta is:\r\nF_\\beta = \\frac{(1 + \\beta^2) \\cdot (\\mathrm{precision} \\cdot \\mathrm{recall})}{(\\beta^2 \\cdot \\mathrm{precision} + \\mathrm{recall})}\\,.\r\nTwo other commonly used F measures are the F_{2} measure, which weights recall twice as much as precision, and the F_{0.5} measure, which weights precision twice as much as recall.\r\nThe F-measure was derived by van Rijsbergen (1979) so that F_\\beta "measures the effectiveness of retrieval with respect to a user who attaches \\beta times as much importance to recall as precision". It is based on van Rijsbergen's effectiveness measure E = 1 - \\frac{1}{\\frac{\\alpha}{P} + \\frac{1-\\alpha}{R}}. Their relationship is F_\\beta = 1 - E where \\alpha=\\frac{1}{1 + \\beta^2}.\r\nAverage precision[edit]\r\nPrecision and recall are single-value metrics based on the whole list of documents returned by the system. For systems that return a ranked sequence of documents, it is desirable to also consider the order in which the returned documents are presented. By computing a precision and recall at every position in the ranked sequence of documents, one can plot a precision-recall curve, plotting precision p(r) as a function of recall r. Average precision computes the average value of p(r) over the interval from r=0 to r=1:[6]\r\n\\operatorname{AveP} = \\int_0^1 p(r)dr\r\nThat is the area under the precision-recall curve. This integral is in practice replaced with a finite sum over every position in the ranked sequence of documents:\r\n\\operatorname{AveP} = \\sum_{k=1}^n P(k) \\Delta r(k)\r\nwhere k is the rank in the sequence of retrieved documents, n is the number of retrieved documents, P(k) is the precision at cut-off k in the list, and \\Delta r(k) is the change in recall from items k-1 to k.[6]\r\nThis finite sum is equivalent to:\r\n \\operatorname{AveP} = \\frac{\\sum_{k=1}^n (P(k) \\times \\operatorname{rel}(k))}{\\mbox{number of relevant documents}} \\!\r\nwhere \\operatorname{rel}(k) is an indicator function equaling 1 if the item at rank k is a relevant document, zero otherwise.[7] Note that the average is over all relevant documents and the relevant documents not retrieved get a precision score of zero.\r\nSome authors choose to interpolate the p(r) function to reduce the impact of "wiggles" in the curve.[8][9] For example, the PASCAL Visual Object Classes challenge (a benchmark for computer vision object detection) computes average precision by averaging the precision over a set of evenly spaced recall levels {0, 0.1, 0.2, ... 1.0}:[8][9]\r\n\\operatorname{AveP} = \\frac{1}{11} \\sum_{r \\in \\{0, 0.1, \\ldots, 1.0\\}} p_{\\operatorname{interp}}(r)\r\nwhere p_{\\operatorname{interp}}(r) is an interpolated precision that takes the maximum precision over all recalls greater than r:\r\np_{\\operatorname{interp}}(r) = \\operatorname{max}_{\\tilde{r}:\\tilde{r} \\geq r} p(\\tilde{r}).\r\nAn alternative is to derive an analytical p(r) function by assuming a particular parametric distribution for the underlying decision values. For example, a binormal precision-recall curve can be obtained by assuming decision values in both classes to follow a Gaussian distribution.[10]\r\nR-Precision[edit]\r\nPrecision at R-th position in the ranking of results for a query that has R relevant documents. This measure is highly correlated to Average Precision. Also, Precision is equal to Recall at the R-th position.\r\nMean average precision[edit]\r\nMean average precision for a set of queries is the mean of the average precision scores for each query.\r\n \\operatorname{MAP} = \\frac{\\sum_{q=1}^Q \\operatorname{AveP(q)}}{Q} \\!\r\nwhere Q is the number of queries.\r\nDiscounted cumulative gain[edit]\r\nMain article: Discounted cumulative gain\r\nDCG uses a graded relevance scale of documents from the result set to evaluate the usefulness, or gain, of a document based on its position in the result list. The premise of DCG is that highly relevant documents appearing lower in a search result list should be penalized as the graded relevance value is reduced logarithmically proportional to the position of the result.\r\nThe DCG accumulated at a particular rank position p is defined as:\r\n \\mathrm{DCG_{p}} = rel_{1} + \\sum_{i=2}^{p} \\frac{rel_{i}}{\\log_{2}i}. \r\nSince result set may vary in size among different queries or systems, to compare performances the normalised version of DCG uses an ideal DCG. To this end, it sorts documents of a result list by relevance, producing an ideal DCG at position p (IDCG_p), which normalizes the score:\r\n \\mathrm{nDCG_{p}} = \\frac{DCG_{p}}{IDCG{p}}. \r\nThe nDCG values for all queries can be averaged to obtain a measure of the average performance of a ranking algorithm. Note that in a perfect ranking algorithm, the DCG_p will be the same as the IDCG_p producing an nDCG of 1.0. All nDCG calculations are then relative values on the interval 0.0 to 1.0 and so are cross-query comparable.\r\nOther Measures[edit]\r\nMean reciprocal rank\r\nSpearman's rank correlation coefficient\r\nTimeline[edit]\r\nBefore the 1900s\r\n1801: Joseph Marie Jacquard invents the Jacquard loom, the first machine to use punched cards to control a sequence of operations.\r\n1880s: Herman Hollerith invents an electro-mechanical data tabulator using punch cards as a machine readable medium.\r\n1890 Hollerith cards, keypunches and tabulators used to process the 1890 US Census data.\r\n1920s-1930s\r\nEmanuel Goldberg submits patents for his "Statistical Machine a document search engine that used photoelectric cells and pattern recognition to search the metadata on rolls of microfilmed documents.\r\n1940s1950s\r\nlate 1940s: The US military confronted problems of indexing and retrieval of wartime scientific research documents captured from Germans.\r\n1945: Vannevar Bush's As We May Think appeared in Atlantic Monthly.\r\n1947: Hans Peter Luhn (research engineer at IBM since 1941) began work on a mechanized punch card-based system for searching chemical compounds.\r\n1950s: Growing concern in the US for a "science gap" with the USSR motivated, encouraged funding and provided a backdrop for mechanized literature searching systems (Allen Kent et al.) and the invention of citation indexing (Eugene Garfield).\r\n1950: The term "information retrieval" appears to have been coined by Calvin Mooers.[11]\r\n1951: Philip Bagley conducted the earliest experiment in computerized document retrieval in a master thesis at MIT.[12]\r\n1955: Allen Kent joined Case Western Reserve University, and eventually became associate director of the Center for Documentation and Communications Research. That same year, Kent and colleagues published a paper in American Documentation describing the precision and recall measures as well as detailing a proposed "framework" for evaluating an IR system which included statistical sampling methods for determining the number of relevant documents not retrieved.\r\n1958: International Conference on Scientific Information Washington DC included consideration of IR systems as a solution to problems identified. See: Proceedings of the International Conference on Scientific Information, 1958 (National Academy of Sciences, Washington, DC, 1959)\r\n1959: Hans Peter Luhn published "Auto-encoding of documents for information retrieval."\r\n1960s:\r\nearly 1960s: Gerard Salton began work on IR at Harvard, later moved to Cornell.\r\n1960: Melvin Earl Maron and John Lary Kuhns[13] published "On relevance, probabilistic indexing, and information retrieval" in the Journal of the ACM 7(3):216244, July 1960.\r\n1962:\r\nCyril W. Cleverdon published early findings of the Cranfield studies, developing a model for IR system evaluation. See: Cyril W. Cleverdon, "Report on the Testing and Analysis of an Investigation into the Comparative Efficiency of Indexing Systems". Cranfield Collection of Aeronautics, Cranfield, England, 1962.\r\nKent published Information Analysis and Retrieval.\r\n1963:\r\nWeinberg report "Science, Government and Information" gave a full articulation of the idea of a "crisis of scientific information." The report was named after Dr. Alvin Weinberg.\r\nJoseph Becker and Robert M. Hayes published text on information retrieval. Becker, Joseph; Hayes, Robert Mayo. Information storage and retrieval: tools, elements, theories. New York, Wiley (1963).\r\n1964:\r\nKaren Sprck Jones finished her thesis at Cambridge, Synonymy and Semantic Classification, and continued work on computational linguistics as it applies to IR.\r\nThe National Bureau of Standards sponsored a symposium titled "Statistical Association Methods for Mechanized Documentation." Several highly significant papers, including G. Salton's first published reference (we believe) to the SMART system.\r\nmid-1960s:\r\nNational Library of Medicine developed MEDLARS Medical Literature Analysis and Retrieval System, the first major machine-readable database and batch-retrieval system.\r\nProject Intrex at MIT.\r\n1965: J. C. R. Licklider published Libraries of the Future.\r\n1966: Don Swanson was involved in studies at University of Chicago on Requirements for Future Catalogs.\r\nlate 1960s: F. Wilfrid Lancaster completed evaluation studies of the MEDLARS system and published the first edition of his text on information retrieval.\r\n1968:\r\nGerard Salton published Automatic Information Organization and Retrieval.\r\nJohn W. Sammon, Jr.'s RADC Tech report "Some Mathematics of Information Storage and Retrieval..." outlined the vector model.\r\n1969: Sammon's "A nonlinear mapping for data structure analysis" (IEEE Transactions on Computers) was the first proposal for visualization interface to an IR system.\r\n1970s\r\nearly 1970s:\r\nFirst online systemsNLM's AIM-TWX, MEDLINE; Lockheed's Dialog; SDC's ORBIT.\r\nTheodor Nelson promoting concept of hypertext, published Computer Lib/Dream Machines.\r\n1971: Nicholas Jardine and Cornelis J. van Rijsbergen published "The use of hierarchic clustering in information retrieval", which articulated the "cluster hypothesis."[14]\r\n1975: Three highly influential publications by Salton fully articulated his vector processing framework and term discrimination model:\r\nA Theory of Indexing (Society for Industrial and Applied Mathematics)\r\nA Theory of Term Importance in Automatic Text Analysis (JASIS v. 26)\r\nA Vector Space Model for Automatic Indexing (CACM 18:11)\r\n1978: The First ACM SIGIR conference.\r\n1979: C. J. van Rijsbergen published Information Retrieval (Butterworths). Heavy emphasis on probabilistic models.\r\n1980s\r\n1980: First international ACM SIGIR conference, joint with British Computer Society IR group in Cambridge.\r\n1982: Nicholas J. Belkin, Robert N. Oddy, and Helen M. Brooks proposed the ASK (Anomalous State of Knowledge) viewpoint for information retrieval. This was an important concept, though their automated analysis tool proved ultimately disappointing.\r\n1983: Salton (and Michael J. McGill) published Introduction to Modern Information Retrieval (McGraw-Hill), with heavy emphasis on vector models.\r\n1985: David Blair and Bill Maron publish: An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System\r\nmid-1980s: Efforts to develop end-user versions of commercial IR systems.\r\n19851993: Key papers on and experimental systems for visualization interfaces.\r\nWork by Donald B. Crouch, Robert R. Korfhage, Matthew Chalmers, Anselm Spoerri and others.\r\n1989: First World Wide Web proposals by Tim Berners-Lee at CERN.\r\n1990s\r\n1992: First TREC conference.\r\n1997: Publication of Korfhage's Information Storage and Retrieval[15] with emphasis on visualization and multi-reference point systems.\r\nlate 1990s: Web search engines implementation of many features formerly found only in experimental IR systems. Search engines become the most common and maybe best instantiation of IR models, research, and implement by Jocanz.\r\nAwards in the field[edit]\r\n\r\nTony Kent Strix award\r\nGerard Salton Award\r\nSee also[edit]\r\n\r\nAdversarial information retrieval\r\nCollaborative information seeking\r\nControlled vocabulary\r\nCross-language information retrieval\r\nData mining\r\nEuropean Summer School in Information Retrieval\r\nHumancomputer information retrieval\r\nInformation extraction\r\nInformation Retrieval Facility\r\nKnowledge visualization\r\nMultimedia Information Retrieval\r\nList of information retrieval libraries\r\nPersonal information management\r\nRelevance (Information Retrieval)\r\nRelevance feedback\r\nRocchio Classification\r\nSearch index\r\nSocial information seeking\r\nSpecial Interest Group on Information Retrieval\r\nStructured Search\r\nSubject indexing\r\nTemporal information retrieval\r\nTf-idf\r\nXML-Retrieval\r\nKey-objects\r\nReferences[edit]\r\n\r\n\r\n Wikiquote has a collection of quotations related to: Information retrieval\r\nACM SIGIR: Information Retrieval Special Interest Group\r\nBCS IRSG: British Computer Society - Information Retrieval Specialist Group\r\nText Retrieval Conference (TREC)\r\nForum for Information Retrieval Evaluation (FIRE)\r\nInformation Retrieval (online book) by C. J. van Rijsbergen\r\nInformation Retrieval Wiki\r\nInformation Retrieval Facility\r\nInformation Retrieval @ DUTH\r\nIntroduction to Information Retrieval (online book) by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schtze, Cambridge University Press. 2008.\r\nCategories: Information retrievalNatural language processing\r\nNavigation menu\r\nCreate accountLog inArticleTalkReadEditView history\r\n\r\nEdit links\r\nThis page was last modified on 17 February 2014 at 05:19.\r\nText is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. \r\nWikipedia is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.\r\nPrivacy policyAbout WikipediaDisclaimersContact WikipediaDevelopersMobile viewWikimedia Foundation Powered by MediaWiki', 80, '2014-04-01 21:01:27', 2014),
(' George Smith, Naomi Moore, John Wilson', 'Defining Climate Change', NULL, 2, '2014-04-01 21:30:51', ' \r\nClimate change is a significant and lasting change in the statistical distribution of weather patterns over periods ranging from decades to millions of years. It may be a change in average weather conditions, or in the distribution of weather around the average conditions (i.e., more or fewer extreme weather events). Climate change is caused by factors such as biotic processes, variations in solar radiation received by Earth, plate tectonics, and volcanic eruptions. Certain human activities have also been identified as significant causes of recent climate change, often referred to as "global warming".\r\n \r\nScientists actively work to understand past and future climate by using observations and theoretical models. A climate record extending deep into the Earth's past has been assembled, and continues to be built up, based on geological evidence from borehole temperature profiles, cores removed from deep accumulations of ice, floral and faunal records, glacial and periglacial processes, stable-isotope and other analyses of sediment layers, and records of past sea levels. More recent data are provided by the instrumental record. General circulation models, based on the physical sciences, are often used in theoretical approaches to match past climate data, make future projections, and link causes and effects in climate change.\r\n \r\nThe most general definition of climate change is a change in the statistical properties of the climate system when considered over long periods of time, regardless of cause. Accordingly, fluctuations over periods shorter than a few decades, such as El Nio, do not represent climate change.\r\nThe term sometimes is used to refer specifically to climate change caused by human activity, as opposed to changes in climate that may have resulted as part of Earth's natural processes. In this sense, especially in the context of environmental policy, the term climate change has become synonymous with anthropogenic global warming. Within scientific journals, global warming refers to surface temperature increases while climate change includes global warming and everything else that increasing greenhouse gas levels will affect.\r\n \r\nOn the broadest scale, the rate at which energy is received from the sun and the rate at which it is lost to space determine the equilibrium temperature and climate of Earth. This energy is distributed around the globe by winds, ocean currents, and other mechanisms to affect the climates of different regions.\r\n \r\nFactors that can shape climate are called climate forcings or "forcing mechanisms". These include processes such as variations in solar radiation, variations in the Earth's orbit, mountain-building and continental drift and changes in greenhouse gas concentrations. There are a variety of climate change feedbacks that can either amplify or diminish the initial forcing. Some parts of the climate system, such as the oceans and ice caps, respond slowly in reaction to climate forcings, while others respond more quickly.\r\n \r\nForcing mechanisms can be either "internal" or "external". Internal forcing mechanisms are natural processes within the climate system itself (e.g., the thermohaline circulation). External forcing mechanisms can be either natural (e.g., changes in solar output) or anthropogenic (e.g., increased emissions of greenhouse gases).\r\n \r\nWhether the initial forcing mechanism is internal or external, the response of the climate system might be fast (e.g., a sudden cooling due to airborne volcanic ash reflecting sunlight), slow (e.g. thermal expansion of warming ocean water), or a combination (e.g., sudden loss of albedo in the arctic ocean as sea ice melts, followed by more gradual thermal expansion of the water). Therefore, the climate system can respond abruptly, but the full response to forcing mechanisms might not be fully developed for centuries or even longer.\r\n', 81, '2014-04-01 21:31:25', 2010),
('Thomas Garcia', 'How climate change affects human life', NULL, 2, '2014-04-01 21:32:33', 'If you think of climate change as a hazard for some far-off polar bears years from now, you're mistaken. That's the message from top climate scientists gathering in Japan this week to assess the impact of global warming. In fact, they will say, the dangers of a warming Earth are immediate and very human. "The polar bear is us," says Patricia Romero Lankao of the federally financed National Center for Atmospheric Research in Boulder, Colorado, referring to the first species to be listed as threatened by global warming due to melting sea ice. She will be among the more than 60 scientists in Japan to finish writing a massive and authoritative report on the impacts of global warming. With representatives from about 100 governments at this week's meeting of the Intergovernmental Panel on Climate Change, they'll wrap up a summary that tells world leaders how bad the problem is.\r\n \r\nThe key message from leaked drafts and interviews with the authors and other scientists: The big risks and overall effects of global warming are far more immediate and local than scientists once thought. It's not just about melting ice, threatened animals and plants. It's about the human problems of hunger, disease, drought, flooding, refugees and war, becoming worse.\r\n \r\nThe report says scientists have already observed many changes from warming, such as an increase in heat waves in North America, Europe, Africa and Asia. Severe floods, such as the one that displaced 90,000 people in Mozambique in 2008, are now more common in Africa and Australia. Europe and North America are getting more intense downpours that can be damaging. Melting ice in the Arctic is not only affecting the polar bear, but already changing the culture and livelihoods of indigenous people in northern Canada.\r\n \r\nPast panel reports have been ignored because global warming's effects seemed too distant in time and location, says Pennsylvania State University scientist Michael Mann. This report finds "It's not far-off in the future and it's not exotic creatures it's us and now," says Mann, who didn't work on this latest report.\r\n \r\nThe United Nations established the climate change panel in 1988 and its work is done by three groups. One looks at the science behind global warming. The group meeting in Japan beginning Tuesday studies its impacts. And a third looks at ways to slow warming. Its reports have reiterated what nearly every major scientific organization has said: The burning of coal, oil and gas is producing an increasing amount of heat-trapping greenhouse gases, such as carbon dioxide. Those gases change Earth's climate, bringing warmer temperatures and more extreme weather, and the problem is worsening.\r\n \r\nThe panel won the Nobel Peace Prize in 2007, months after it issued its last report. Since then, the impact group has been reviewing the latest research and writing 30 chapters on warming's effects and regional impacts. Those chapters haven't been officially released but were posted on a skeptical website. The key message can be summed up in one word that the overall report uses more than 5,000 times: risk. "Climate change really is a challenge in managing risks," says the report's chief author, Chris Field of the Carnegie Institution of Science in California. "It's very clear that we are not prepared for the kind of events we're seeing."\r\n \r\nAlready the effects of global warming are "widespread and consequential," says one part of the larger report, noting that science has compiled more evidence and done much more research since the last report in 2007.\r\n \r\nIf climate change continues, the panel's larger report predicts these harms:\r\n \r\nVIOLENCE: For the first time, the panel is emphasizing the nuanced link between conflict and warming temperatures. Participating scientists say warming won't cause wars, but it will add a destabilizing factor that will make existing threats worse.\r\n \r\nFOOD: Global food prices will rise between 3 and 84 percent by 2050 because of warmer temperatures and changes in rain patterns. Hotspots of hunger may emerge in cities.\r\n \r\nWATER: About one-third of the world's population will see groundwater supplies drop by more than 10 percent by 2080, when compared with 1980 levels. For every degree of warming, more of the world will have significantly less water available.\r\n \r\nHEALTH: Major increases in health problems are likely, with more illnesses and injury from heat waves and fires and more food and water-borne diseases. But the report also notes that warming's effects on health is relatively small compared with other problems, like poverty.\r\n \r\nWEALTH: Many of the poor will get poorer. Economic growth and poverty reduction will slow down. If temperatures rise high enough, the world's overall income may start to go down, by as much as 2 percent, but that's difficult to forecast.\r\n \r\nAccording to the report, risks from warming-related extreme weather, now at a moderate level, are likely to get worse with just a bit more warming. While it doesn't say climate change caused the events, the report cites droughts in northern Mexico and the south-central United States, and hurricanes such as 2012's Sandy, as illustrations of how vulnerable people are to weather extremes. It does say the deadly European heat wave in 2003 was made more likely because of global warming.\r\n \r\nTexas Tech University climate scientist Katharine Hayhoe, who was not part of this report team, says the important nuance is how climate change interacts with other human problems: "It's interacting and exacerbating problems we already have today. "University of Colorado science policy professor Roger Pielke Jr., a past critic of the panel's impact reports, said after reading the draft summary, "it's a lot of important work ... They made vast improvements to the quality of their assessments." Another critic, University of Alabama Huntsville professor John Christy, accepts man-made global warming but thinks its risks are overblown when compared with something like poverty. Climate change is not among the developing world's main problems, he says.\r\n \r\nBut other scientists say Christy is misguided. Earlier this month, the world's largest scientific organization, the American Association for the Advancement of Science, published a new fact sheet on global warming. It said: "Climate change is already happening. More heat waves, greater sea level rise and other changes with consequences for human health, natural ecosystems and agriculture are already occurring in the United States and worldwide. These problems are very likely to become worse over the next 10 to 20 years and beyond." Texas Tech's Hayhoe says scientists in the past may have created the impression that the main reason to care about climate change was its impact on the environment. "We care about it because it's going to affect nearly every aspect of human life on this planet," she says.', 82, '2014-04-01 21:32:46', 2005),
('Anne Buchanan', 'Is complex disease risk predictable, or even parseable?', NULL, NULL, '2014-04-01 22:05:34', 'A very smart friend of ours tells us that he likes our posts about the problems that are impeding the kind of progress toward explaining and predicting disease that are major goals of genetics and epidemiology. What causes asthma, or heart disease, or obesity, or hypertension or breast cancer? And, how can we know who is going to get these diseases? We have written a lot about why these questions are so hard to answer. But now our friend is asking us for a solution. \r\n\r\nPerhaps it does seem unfair to criticize without proposing a solution, though we (naturally) think otherwise. At risk of sounding like a broken record, as we've said before, asking the critic to solve the problem he or she is criticizing is like asking a jury who votes not to convict to identify the guilty party! We've also said before that the solution, if there is one, is going to come from some smart young person who sees the problems and beyond. Such insights can't be ordered up the way you order a burger at McDonald's; history shows that they arise unpredictably -- but in prepared minds.\r\n\r\nThe very successful early history of advances in genetics and epidemiology are telling, as they illuminate what it is about the diseases these fields still haven't explained that makes them so seemingly intractable. In retrospect, these fields picked the low-hanging fruit, which led to the assumption that all the fruit would be as easy to pluck. In some ways, rather arrogantly, investigators even trumpeted the fact that they were just picking the low fruit.\r\n\r\nThe same could be said for epidemiology -- infectious disease agents were, in retrospect, easy, and finding them led to major successes in treatment and prevention, with major improvements in public health. These were largely solutions to 'point cause' problems -- one exposure or an exposure to one thing was enough to generate the results. Infectious disease led the way, but single gene 'Mendelian' science wasn't too far behind -- though at first the actual gene was hard to identify even if the single-gene cause seemed clear.', 83, '2014-04-01 22:06:17', 2014);
-- --------------------------------------------------------
--
-- Table structure for table `keywords`
--
CREATE TABLE IF NOT EXISTS `keywords` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`document_id` int(11) DEFAULT NULL,
`keyword` text,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=92 ;
--
-- Dumping data for table `keywords`
--
INSERT INTO `keywords` (`id`, `document_id`, `keyword`) VALUES
(1, 67, 'inform'),
(2, 67, 'retriev'),
(3, 67, 'document'),
(4, 67, 'precis'),
(5, 67, 'system'),
(6, 67, 'vector'),
(7, 68, 'inform'),
(8, 68, 'retriev'),
(9, 68, 'document'),
(10, 68, 'precis'),
(11, 68, 'system'),
(12, 68, 'vector'),
(13, 69, 'inform'),
(14, 69, 'retriev'),
(15, 69, 'document'),
(16, 69, 'precis'),
(17, 69, 'system'),
(18, 69, 'vector'),
(19, 70, 'inform'),
(20, 70, 'retriev'),
(21, 70, 'document'),
(22, 70, 'precis'),
(23, 70, 'system'),
(24, 70, 'retrieval"'),
(25, 71, 'ametlorem'),
(26, 72, 'inform'),
(27, 72, 'retriev'),
(28, 72, 'document'),
(29, 72, 'precis'),
(30, 72, 'system'),
(31, 72, 'vector'),
(32, 73, 'ametlorem'),
(33, 73, 'ametlorem.'),
(34, 74, 'inform'),
(35, 74, 'retriev'),
(36, 74, 'document'),
(37, 74, 'precis'),
(38, 74, 'system'),
(39, 74, 'vector'),
(40, 75, 'inform'),
(41, 75, 'retriev'),
(42, 75, 'document'),
(43, 75, 'precis'),
(44, 75, 'system'),
(45, 75, 'vector'),
(46, 76, 'retriev'),
(47, 76, 'precis'),
(48, 76, 'vector'),
(49, 76, 'object'),
(50, 76, 'query.'),
(51, 76, 'result'),
(52, 78, 'retriev'),
(53, 78, 'precis'),
(54, 78, 'vector'),
(55, 78, 'object'),
(56, 78, 'query.'),
(57, 78, 'result'),
(58, 79, 'retriev'),
(59, 79, 'precis'),
(60, 79, 'vector'),
(61, 79, 'object'),
(62, 79, 'query.'),
(63, 79, 'result'),
(64, 80, 'retriev'),
(65, 80, 'precis'),
(66, 80, 'inform'),
(67, 80, 'document'),
(68, 80, 'vector'),
(69, 80, 'object'),
(70, 81, 'retriev'),
(71, 81, 'precis'),
(72, 81, 'inform'),
(73, 81, 'document'),
(74, 81, 'vector'),
(75, 81, 'object'),
(76, 81, 'climat'),
(77, 81, '(e.g.,'),
(78, 81, 'weather'),
(79, 81, 'volcan'),
(80, 81, 'years.'),
(81, 81, 'system'),
(82, 82, 'report'),
(83, 82, 'alreadi'),
(84, 82, 'climat'),
(85, 82, 'scientist'),
(86, 82, 'world''),
(87, 82, 'warming.'),
(88, 83, 'diseas'),
(89, 83, 'we'v'),
(90, 83, 'problem'),
(91, 83, 'without');
-- --------------------------------------------------------
--
-- Table structure for table `notes`
--
CREATE TABLE IF NOT EXISTS `notes` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`sentence_id` int(11) NOT NULL,
`note` text NOT NULL,
`user_id` int(11) NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=52 ;
--
-- Dumping data for table `notes`
--
INSERT INTO `notes` (`id`, `sentence_id`, `note`, `user_id`) VALUES
(48, 21877, 'testnote\n', 3),
(49, 21877, 'testnote\n', 23),
(50, 21877, 'testnote\n', 25),
(51, 21877, 'testnote\n', 25);
-- --------------------------------------------------------
--
-- Table structure for table `sentences`
--
CREATE TABLE IF NOT EXISTS `sentences` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`document_id` int(11) NOT NULL,
`sentence` text NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 AUTO_INCREMENT=22123 ;
--
-- Dumping data for table `sentences`
--
INSERT INTO `sentences` (`id`, `document_id`, `sentence`) VALUES
(16503, 10, 'Last week, Google unveiled its vision of the smartwatch, the elusive Next Big Gadget.'),
(16504, 10, 'It takes the form of Android Wear, a new version of the mobile operating system designed specifically for on-the-body devices.'),
(16505, 10, 'It''s a good deal more sophisticated than the smartwatches we''ve seen hitherto, relying on the company''s unparalleled voice recognition for registering simple commands and promising to serve up the ``info and suggestions you need, right when you need them'''' thanks to the same predictive, personalized algorithms that power Google Now on Android phones.'),
(16506, 10, '\r\n\r\n Amidst speculation that Apple''s long-fabled iWatch might in fact be a health-specific wristband, Android Wear is clearly aiming for something much bigger.'),
(16507, 10, 'And that makes sense.'),
(16508, 10, 'If there''s any company today that has a chance to make the multipurpose smartwatch we''ve all been dreaming of, it''s Google.'),
(16509, 10, 'But it''s not just heaps of data and algorithmic might that make Android Wear promising.'),
(16510, 10, 'It''s also Google''s approach to the endeavor--its willingness to let third-party developers deeper into the stack and, potentially, to let users define the experience for themselves--that could help make it a hit.'),
(16511, 10, '\r\n\r\n Context Is King Context is the holy grail of wearable devices.'),
(16512, 10, 'With the limited real estate of a watch face, knowing what app, service, prompt or data point a person needs at a specific moment becomes paramount.'),
(16513, 10, 'The shiny promotional videos Google released this week show how context plays out in Android Wear in a number of situations.'),
(16514, 10, 'On the bus, your smartwatch might show you the next few stops; if there''s a meeting coming up, it''ll remind you who it''s with, and offer directions for how to get there.'),
(16515, 10, 'The video suggests a few less obvious use cases, too.'),
(16516, 10, 'If your Android Wear watch feels itself shaking around and its microphone hears music, it might figure out that you''re dancing, and tell you what song''s playing.'),
(16517, 10, '\r\n\r\n But context isn''t just about using sensors to intuit your environment and activity.'),
(16518, 10, 'It''s also about tying your scattered digital existence to your actual, physical self.'),
(16519, 10, 'It''s about looking at your calendar, your inbox, and your contacts in concert, cross-referencing them, and coming away with a more human understanding of your schedule, your to-do list, and your circle of friends.'),
(16520, 10, 'When it was released in 2004, Gmail did away with the hassle of organizing email by letting you search through your inbox.'),
(16521, 10, 'At its best, a contextually-savvy operating system like Android Wear takes the next step, doing away the hassle of search by surfacing the stuff you need automatically when you need it.'),
(16522, 10, '\r\n\r\n It''s this second, more intimate type of context that Google is so uniquely poised to conquer, according to Nick de la Mare, principal of design consultancy Big Tomorrow.'),
(16523, 10, 'De la Mare, who worked extensively on wearable projects as Executive Creator Director at Frog, sees Android Wear signaling a move to contextually-driven simplicity over the ``maximalist,'''' computer-on-your-wrist approach of watches like the Galaxy Gear.'),
(16524, 10, '\r\n\r\n ``There are very few companies that have that repository of data to provide that simplicity,'''' de la Mare says.'),
(16525, 10, '``Google is one of the only organizations that can take the management away from you and provide something meaningful.'''''),
(16526, 10, '\r\n\r\n Revisiting Our Assumptions About Apps Image: Google Image: Google Contextual awareness is the key to a functionally robust smartwatch.'),
(16527, 10, 'What will make one truly useful, however, is how easy it is to use.'),
(16528, 10, 'The metric for success is simple: for a smartwatch to make sense, it has to let you do things more quickly than you could by pulling your smartphone out of your pocket.'),
(16529, 10, '\r\n\r\n This is where a lightweight user interface is key, and it seems like Google''s got a promising foundation, mixing concise, swipe-able cards with optional voice commands.'),
(16530, 10, 'From one perspective, it''s the logical continuation of the card-based UI that took root with Google Now.'),
(16531, 10, 'From a different viewpoint, however, it''s something considerably more radical: a reinvention of mobile apps as we know them.'),
(16532, 10, '\r\n\r\n The Android Wear UI is based on two core functions: ``suggest'''' and ``demand.'''''),
(16533, 10, 'Suggest is the term Google uses for all the notification cards that make up the watch''s ``context stream.'''''),
(16534, 10, 'These could include urgent notifications, like text messages, that buzz your wrist when they come in, or morsels of data that get silently added to your stack, like scores of sports games.'),
(16535, 10, '\r\n\r\n But these aren''t ``notifications'''' in the smartphone sense--hollering flags that pull you back into a third-party app.'),
(16536, 10, 'On the watch, they serve as the apps themselves.'),
(16537, 10, 'Google lays out strict guidelines for how these should work: ``Omit needless text from your notifications.'),
(16538, 10, 'Design for glance-ability, not reading.'),
(16539, 10, 'Use words and phrases, not sentences.'),
(16540, 10, 'Show, don''t tell: Where possible use simple icons, glyphs, and visualizations to convey your message.'''''),
(16541, 10, '\r\n\r\nA smartwatch has to let you do things more quickly than you could by pulling your smartphone out.'),
(16542, 10, '\r\n\r\n Notifications can be supplemented with additional ``pages,'''' which people can access by swiping sideways on their smartwatch screen.'),
(16543, 10, 'These can add additional information or actions users can take on the data.'),
(16544, 10, 'The example Google gives is a reminder for a picnic.'),
(16545, 10, 'The notification itself reminds you that you have a picnic scheduled with a friend; the next page tells you that you''re responsible for bringing bread, cheese, and wine; and the third gives you a button for navigating to the spot.'),
(16546, 10, '\r\n\r\n It''s worth reiterating: This is Google''s idea of a smartwatch app.'),
(16547, 10, 'Timely notifications and relevant actions, all bundled up in a relatively strict visual language.'),
(16548, 10, 'Apps, in this vision, become much more homogenized; they''re about utility, service, information and action more than anything else.'),
(16549, 10, 'In this new model, you don''t tap icons to summon apps.'),
(16550, 10, 'Instead, they just pop up when you need them, triggered by contextual cues like location, time, or activity.'),
(16551, 10, '\r\n\r\n The other part of the Android Wear interface is ``demand,'''' encompassing something Google refers to as the ``Cue Card.'''''),
(16552, 10, 'This is a list of commands that can be spoken or tapped on screen.'),
(16553, 10, 'From the look of things, it seems like these will include a preset list of actions for calling cabs, taking notes, sending messages, setting alarms and the like.'),
(16554, 10, 'These can either be triggered by tapping the screen, or by saying the command aloud.'),
(16555, 10, 'In Android Wear, apps aren''t to be thought of as discrete programs but rather as actions you can take.'),
(16556, 10, '\r\n\r\n Here''s an important bit: Google''s developer documents state that users will be able to choose which app corresponds to these demands.'),
(16557, 76, 'Information retrieval is the activity of obtaining information resources relevant to an information need from a collection of information resources.'),
(16558, 76, 'Searches can be based on metadata or on full-text (or other content-based) indexing.'),
(16559, 76, '\n\nAutomated information retrieval systems are used to reduce what has been called "information overload".'),
(16560, 76, 'Many universities and public libraries use IR systems to provide access to books, journals and other documents.'),
(16561, 76, 'Web search engines are the most visible IR applications.'),
(16562, 76, '\n\nContents [hide] 1 Overview 2 History 3 Model types 3.1 First dimension: mathematical basis 3.2 Second dimension: properties of the model 4 Performance and correctness measures 4.1 Precision 4.2 Recall 4.3 Fall-out 4.4 F-measure 4.5 Average precision 4.6 R-Precision 4.7 Mean average precision 4.8 Discounted cumulative gain 4.9 Other Measures 4.10 Timeline 5 Awards in the field 6 See also 7 References 8 External links Overview[edit] An information retrieval process begins when a user enters a query into the system.'),
(16563, 76, 'Queries are formal statements of information needs, for example search strings in web search engines.'),
(16564, 76, 'In information retrieval a query does not uniquely identify a single object in the collection.'),
(16565, 76, 'Instead, several objects may match the query, perhaps with different degrees of relevancy.'),
(16566, 76, '\n\nAn object is an entity that is represented by information in a database.'),
(16567, 76, 'User queries are matched against the database information.'),
(16568, 76, 'Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos.'),
(16569, 76, 'Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.'),
(16570, 76, '\n\nMost IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value.'),
(16571, 76, 'The top ranking objects are then shown to the user.'),
(16572, 76, 'The process may then be iterated if the user wishes to refine the query.'),
(16573, 76, '[4] History[edit] But do you know that, although I have kept the diary [on a phonograph] for months past, it never once struck me how I was going to find any particular part of it in case I wanted to look it up?'),
(16574, 76, '\n\nDr Seward, Bram Stoker's Dracula, 1897 The idea of using computers to search for relevant pieces of information was popularized in the article As We May Think by Vannevar Bush in 1945.'),
(16575, 76, '[5] The first automated information retrieval systems were introduced in the 1950s and 1960s.'),
(16576, 76, 'By 1970 several different techniques had been shown to perform well on small text corpora such as the Cranfield collection (several thousand documents).'),
(16577, 76, '[5] Large-scale retrieval systems, such as the Lockheed Dialog system, came into use early in the 1970s.'),
(16578, 76, '\n\nIn 1992, the US Department of Defense along with the National Institute of Standards and Technology (NIST), cosponsored the Text Retrieval Conference (TREC) as part of the TIPSTER text program.'),
(16579, 76, 'The aim of this was to look into the information retrieval community by supplying the infrastructure that was needed for evaluation of text retrieval methodologies on a very large text collection.'),
(16580, 76, 'This catalyzed research on methods that scale to huge corpora.'),
(16581, 76, 'The introduction of web search engines has boosted the need for very large scale retrieval systems even further.'),
(16582, 76, '\n\nModel types[edit] Categorization of IR-models (translated from German entry, original source Dominik Kuropka).'),
(16583, 76, '\n\nFor effectively retrieving relevant documents by IR strategies, the documents are typically transformed into a suitable representation.'),
(16584, 76, 'Each retrieval strategy incorporate a specific model for its document representation purposes.'),
(16585, 76, 'The picture on the right illustrates the relationship of some common models.'),
(16586, 76, 'In the picture, the models are categorized according to two dimensions: the mathematical basis and the properties of the model.'),
(16587, 76, '\n\nFirst dimension: mathematical basis[edit] Set-theoretic models represent documents as sets of words or phrases.'),
(16588, 76, 'Similarities are usually derived from set-theoretic operations on those sets.'),
(16589, 76, 'Common models are: Standard Boolean model Extended Boolean model Fuzzy retrieval Algebraic models represent documents and queries usually as vectors, matrices, or tuples.'),
(16590, 76, 'The similarity of the query vector and document vector is represented as a scalar value.'),
(16591, 76, '\n\nVector space model Generalized vector space model (Enhanced) Topic-based Vector Space Model Extended Boolean model Latent semantic indexing aka latent semantic analysis Probabilistic models treat the process of document retrieval as a probabilistic inference.'),
(16592, 76, 'Similarities are computed as probabilities that a document is relevant for a given query.'),
(16593, 76, 'Probabilistic theorems like the Bayes' theorem are often used in these models.'),
(16594, 76, '\n\nBinary Independence Model Probabilistic relevance model on which is based the okapi (BM25) relevance function Uncertain inference Language models Divergence-from-randomness model Latent Dirichlet allocation Feature-based retrieval models view documents as vectors of values of feature functions (or just features) and seek the best way to combine these features into a single relevance score, typically by learning to rank methods.'),
(16595, 76, 'Feature functions are arbitrary functions of document and query, and as such can easily incorporate almost any other retrieval model as just a yet another feature.'),
(16596, 76, '\n\nSecond dimension: properties of the model[edit] Models without term-interdependencies treat different terms/words as independent.'),
(16597, 76, 'This fact is usually represented in vector space models by the orthogonality assumption of term vectors or in probabilistic models by an independency assumption for term variables.'),
(16598, 76, '\n\nModels with immanent term interdependencies allow a representation of interdependencies between terms.'),
(16599, 76, 'However the degree of the interdependency between two terms is defined by the model itself.'),
(16600, 76, 'It is usually directly or indirectly derived (e.g. by dimensional reduction) from the co-occurrence of those terms in the whole set of documents.'),
(16601, 76, '\n\nModels with transcendent term interdependencies allow a representation of interdependencies between terms, but they do not allege how the interdependency between two terms is defined.'),
(16602, 76, 'They relay an external source for the degree of interdependency between two terms.'),
(16603, 76, '(For example a human or sophisticated algorithms.)'),
(16604, 76, '\n\nPerformance and correctness measures[edit] Main article: Precision and recall Many different measures for evaluating the performance of information retrieval systems have been proposed.'),
(16605, 76, 'The measures require a collection of documents and a query.'),
(16606, 76, 'All common measures described here assume a ground truth notion of relevancy: every document is known to be either relevant or non-relevant to a particular query.'),
(16607, 76, 'In practice queries may be ill-posed and there may be different shades of relevancy.'),
(16608, 76, '\n\nPrecision[edit] Precision is the fraction of the documents retrieved that are relevant to the user's information need.'),
(16609, 76, '\n\n \\mbox{precision}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{retrieved documents}\\}|} In binary classification, precision is analogous to positive predictive value.'),
(16610, 76, 'Precision takes all retrieved documents into account.'),
(16611, 76, 'It can also be evaluated at a given cut-off rank, considering only the topmost results returned by the system.'),
(16612, 76, 'This measure is called precision at n or P@n.'),
(16613, 76, '\n\nNote that the meaning and usage of "precision" in the field of Information Retrieval differs from the definition of accuracy and precision within other branches of science and statistics.'),
(16614, 76, '\n\nRecall[edit] Recall is the fraction of the documents that are relevant to the query that are successfully retrieved.'),
(16615, 76, '\n\n\\mbox{recall}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{relevant documents}\\}|} In binary classification, recall is often called sensitivity.'),
(16616, 76, 'So it can be looked at as the probability that a relevant document is retrieved by the query.'),
(16617, 76, '\n\nIt is trivial to achieve recall of 100% by returning all documents in response to any query.'),
(16618, 76, 'Therefore recall alone is not enough but one needs to measure the number of non-relevant documents also, for example by computing the precision.'),
(16619, 76, '\n\nFall-out[edit] The proportion of non-relevant documents that are retrieved, out of all non-relevant documents available: \\mbox{fall-out}=\\frac{|\\{\\mbox{non-relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{non-relevant documents}\\}|} In binary classification, fall-out is closely related to specificity and is equal to (1-\\mbox{specificity}).'),
(16620, 76, 'It can be looked at as the probability that a non-relevant document is retrieved by the query.'),
(16621, 76, '\n\nIt is trivial to achieve fall-out of 0% by returning zero documents in response to any query.'),
(16622, 76, '\n\nF-measure[edit] Main article: F-score The weighted harmonic mean of precision and recall, the traditional F-measure or balanced F-score is: F = \\frac{2 \\cdot \\mathrm{precision} \\cdot \\mathrm{recall}}{(\\mathrm{precision} + \\mathrm{recall})}.'),
(16623, 76, '\\, This is also known as the F_1 measure, because recall and precision are evenly weighted.'),
(16624, 76, '\n\nThe general formula for non-negative real \\beta is: F_\\beta = \\frac{(1 + \\beta^2) \\cdot (\\mathrm{precision} \\cdot \\mathrm{recall})}{(\\beta^2 \\cdot \\mathrm{precision} + \\mathrm{recall})}\\,.'),
(16625, 76, '\n\nTwo other commonly used F measures are the F_{2} measure, which weights recall twice as much as precision, and the F_{0.5} measure, which weights precision twice as much as recall.'),
(16626, 76, '\n\nThe F-measure was derived by van Rijsbergen (1979) so that F_\\beta "measures the effectiveness of retrieval with respect to a user who attaches \\beta times as much importance to recall as precision".'),
(16627, 76, 'It is based on van Rijsbergen's effectiveness measure E = 1 - \\frac{1}{\\frac{\\alpha}{P} + \\frac{1-\\alpha}{R}}.'),
(16628, 76, 'Their relationship is F_\\beta = 1 - E where \\alpha=\\frac{1}{1 + \\beta^2}.'),
(16629, 76, '\n\nAverage precision[edit] Precision and recall are single-value metrics based on the whole list of documents returned by the system.'),
(16630, 76, 'For systems that return a ranked sequence of documents, it is desirable to also consider the order in which the returned documents are presented.'),
(16631, 76, 'By computing a precision and recall at every position in the ranked sequence of documents, one can plot a precision-recall curve, plotting precision p(r) as a function of recall r. Average precision computes the average value of p(r) over the interval from r=0 to r=1:[6] \\operatorname{AveP} = \\int_0^1 p(r)dr That is the area under the precision-recall curve.'),
(16632, 76, 'This integral is in practice replaced with a finite sum over every position in the ranked sequence of documents: \\operatorname{AveP} = \\sum_{k=1}^n P(k) \\Delta r(k) where k is the rank in the sequence of retrieved documents, n is the number of retrieved documents, P(k) is the precision at cut-off k in the list, and \\Delta r(k) is the change in recall from items k-1 to k.[6] This finite sum is equivalent to: \\operatorname{AveP} = \\frac{\\sum_{k=1}^n (P(k) \\times \\operatorname{rel}(k))}{\\mbox{number of relevant documents}} \\!'),
(16633, 76, '\n\nwhere \\operatorname{rel}(k) is an indicator function equaling 1 if the item at rank k is a relevant document, zero otherwise.'),
(16634, 76, '[7] Note that the average is over all relevant documents and the relevant documents not retrieved get a precision score of zero.'),
(16635, 76, '\n\nSome authors choose to interpolate the p(r) function to reduce the impact of "wiggles" in the curve.'),
(16636, 76, '[8][9] For example, the PASCAL Visual Object Classes challenge (a benchmark for computer vision object detection) computes average precision by averaging the precision over a set of evenly spaced recall levels {0, 0.1, 0.2, ... 1.0}:[8][9] \\operatorname{AveP} = \\frac{1}{11} \\sum_{r \\in \\{0, 0.1, \\ldots, 1.0\\}} p_{\\operatorname{interp}}(r) where p_{\\operatorname{interp}}(r) is an interpolated precision that takes the maximum precision over all recalls greater than r: p_{\\operatorname{interp}}(r) = \\operatorname{max}_{\\tilde{r}:\\tilde{r} \\geq r} p(\\tilde{r}).'),
(16637, 76, '\n\nAn alternative is to derive an analytical p(r) function by assuming a particular parametric distribution for the underlying decision values.'),
(16638, 76, 'For example, a binormal precision-recall curve can be obtained by assuming decision values in both classes to follow a Gaussian distribution.'),
(16639, 76, '[10] R-Precision[edit] Precision at R-th position in the ranking of results for a query that has R relevant documents.'),
(16640, 76, 'This measure is highly correlated to Average Precision.'),
(16641, 76, 'Also, Precision is equal to Recall at the R-th position.'),
(16642, 76, '\n\nMean average precision[edit] Mean average precision for a set of queries is the mean of the average precision scores for each query.'),
(16643, 76, '\n\n \\operatorname{MAP} = \\frac{\\sum_{q=1}^Q \\operatorname{AveP(q)}}{Q} \\!'),
(16644, 76, '\n\nwhere Q is the number of queries.'),
(16645, 76, '\n\nDiscounted cumulative gain[edit] Main article: Discounted cumulative gain DCG uses a graded relevance scale of documents from the result set to evaluate the usefulness, or gain, of a document based on its position in the result list.'),
(16646, 76, 'The premise of DCG is that highly relevant documents appearing lower in a search result list should be penalized as the graded relevance value is reduced logarithmically proportional to the position of the result.'),
(16647, 76, '\n\nThe DCG accumulated at a particular rank position p is defined as: \\mathrm{DCG_{p}} = rel_{1} + \\sum_{i=2}^{p} \\frac{rel_{i}}{\\log_{2}i}.'),
(16648, 76, '\n\nSince result set may vary in size among different queries or systems, to compare performances the normalised version of DCG uses an ideal DCG.'),
(16649, 76, 'To this end, it sorts documents of a result list by relevance, producing an ideal DCG at position p (IDCG_p), which normalizes the score: \\mathrm{nDCG_{p}} = \\frac{DCG_{p}}{IDCG{p}}.'),
(16650, 76, '\n\nThe nDCG values for all queries can be averaged to obtain a measure of the average performance of a ranking algorithm.'),
(16651, 76, 'Note that in a perfect ranking algorithm, the DCG_p will be the same as the IDCG_p producing an nDCG of 1.0.'),
(16652, 76, 'All nDCG calculations are then relative values on the interval 0.0 to 1.0 and so are cross-query comparable.'),
(16653, 76, '\n\nOther Measures[edit] Mean reciprocal rank Spearman's rank correlation coefficient Timeline[edit] Before the 1900s 1801: Joseph Marie Jacquard invents the Jacquard loom, the first machine to use punched cards to control a sequence of operations.'),
(16654, 76, '\n\n1880s: Herman Hollerith invents an electro-mechanical data tabulator using punch cards as a machine readable medium.'),
(16655, 76, '\n\n1890 Hollerith cards, keypunches and tabulators used to process the 1890 US Census data.'),
(16656, 76, '\n\n1920s-1930s Emanuel Goldberg submits patents for his "Statistical Machine a document search engine that used photoelectric cells and pattern recognition to search the metadata on rolls of microfilmed documents.'),
(16657, 76, '\n\n1940s1950s late 1940s: The US military confronted problems of indexing and retrieval of wartime scientific research documents captured from Germans.'),
(16658, 76, '\n\n1945: Vannevar Bush's As We May Think appeared in Atlantic Monthly.'),
(16659, 76, '\n\n1947: Hans Peter Luhn (research engineer at IBM since 1941) began work on a mechanized punch card-based system for searching chemical compounds.'),
(16660, 76, '\n\n1950s: Growing concern in the US for a "science gap" with the USSR motivated, encouraged funding and provided a backdrop for mechanized literature searching systems (Allen Kent et al.) and the invention of citation indexing (Eugene Garfield).'),
(16661, 76, '\n\n1950: The term "information retrieval" appears to have been coined by Calvin Mooers.'),
(16662, 76, '[11] 1951: Philip Bagley conducted the earliest experiment in computerized document retrieval in a master thesis at MIT.'),
(16663, 76, '[12] 1955: Allen Kent joined Case Western Reserve University, and eventually became associate director of the Center for Documentation and Communications Research.'),
(16664, 76, 'That same year, Kent and colleagues published a paper in American Documentation describing the precision and recall measures as well as detailing a proposed "framework" for evaluating an IR system which included statistical sampling methods for determining the number of relevant documents not retrieved.'),
(16665, 76, '\n\n1958: International Conference on Scientific Information Washington DC included consideration of IR systems as a solution to problems identified.'),
(16666, 76, 'See: Proceedings of the International Conference on Scientific Information, 1958 (National Academy of Sciences, Washington, DC, 1959) 1959: Hans Peter Luhn published "Auto-encoding of documents for information retrieval.'),
(16667, 76, '" 1960s: early 1960s: Gerard Salton began work on IR at Harvard, later moved to Cornell.'),
(16668, 76, '\n\n1960: Melvin Earl Maron and John Lary Kuhns[13] published "On relevance, probabilistic indexing, and information retrieval" in the Journal of the ACM 7(3):216244, July 1960.'),
(16669, 76, '\n\n1962: Cyril W. Cleverdon published early findings of the Cranfield studies, developing a model for IR system evaluation.'),
(16670, 76, 'See: Cyril W. Cleverdon, "Report on the Testing and Analysis of an Investigation into the Comparative Efficiency of Indexing Systems".'),
(16671, 76, 'Cranfield Collection of Aeronautics, Cranfield, England, 1962.'),
(16672, 76, '\n\nKent published Information Analysis and Retrieval.'),
(16673, 76, '\n\n1963: Weinberg report "Science, Government and Information" gave a full articulation of the idea of a "crisis of scientific information.'),
(16674, 76, '" The report was named after Dr. Alvin Weinberg.'),
(16675, 76, '\n\nJoseph Becker and Robert M. Hayes published text on information retrieval.'),
(16676, 76, 'Becker, Joseph; Hayes, Robert Mayo.'),
(16677, 76, 'Information storage and retrieval: tools, elements, theories.'),
(16678, 76, 'New York, Wiley (1963).'),
(16679, 76, '\n\n1964: Karen Sprck Jones finished her thesis at Cambridge, Synonymy and Semantic Classification, and continued work on computational linguistics as it applies to IR.'),
(16680, 76, '\n\nThe National Bureau of Standards sponsored a symposium titled "Statistical Association Methods for Mechanized Documentation.'),
(16681, 76, '" Several highly significant papers, including G. Salton's first published reference (we believe) to the SMART system.'),
(16682, 76, '\n\nmid-1960s: National Library of Medicine developed MEDLARS Medical Literature Analysis and Retrieval System, the first major machine-readable database and batch-retrieval system.'),
(16683, 76, '\n\nProject Intrex at MIT.'),
(16684, 76, '\n\n1965: J. C. R. Licklider published Libraries of the Future.'),
(16685, 76, '\n\n1966: Don Swanson was involved in studies at University of Chicago on Requirements for Future Catalogs.'),
(16686, 76, '\n\nlate 1960s: F. Wilfrid Lancaster completed evaluation studies of the MEDLARS system and published the first edition of his text on information retrieval.'),
(16687, 76, '\n\n1968: Gerard Salton published Automatic Information Organization and Retrieval.'),
(16688, 76, '\n\nJohn W. Sammon, Jr.'s RADC Tech report "Some Mathematics of Information Storage and Retrieval..." outlined the vector model.'),
(16689, 76, '\n\n1969: Sammon's "A nonlinear mapping for data structure analysis" (IEEE Transactions on Computers) was the first proposal for visualization interface to an IR system.'),
(16690, 76, '\n\n1970s early 1970s: First online systemsNLM's AIM-TWX, MEDLINE; Lockheed's Dialog; SDC's ORBIT.'),
(16691, 76, '\n\nTheodor Nelson promoting concept of hypertext, published Computer Lib/Dream Machines.'),
(16692, 76, '\n\n1971: Nicholas Jardine and Cornelis J. van Rijsbergen published "The use of hierarchic clustering in information retrieval", which articulated the "cluster hypothesis.'),
(16693, 76, '"[14] 1975: Three highly influential publications by Salton fully articulated his vector processing framework and term discrimination model: A Theory of Indexing (Society for Industrial and Applied Mathematics) A Theory of Term Importance in Automatic Text Analysis (JASIS v. 26) A Vector Space Model for Automatic Indexing (CACM 18:11) 1978: The First ACM SIGIR conference.'),
(16694, 76, '\n\n1979: C. J. van Rijsbergen published Information Retrieval (Butterworths).'),
(16695, 76, 'Heavy emphasis on probabilistic models.'),
(16696, 76, '\n\n1980s 1980: First international ACM SIGIR conference, joint with British Computer Society IR group in Cambridge.'),
(16697, 76, '\n\n1982: Nicholas J. Belkin, Robert N. Oddy, and Helen M. Brooks proposed the ASK (Anomalous State of Knowledge) viewpoint for information retrieval.'),
(16698, 76, 'This was an important concept, though their automated analysis tool proved ultimately disappointing.'),
(16699, 76, '\n\n1983: Salton (and Michael J. McGill) published Introduction to Modern Information Retrieval (McGraw-Hill), with heavy emphasis on vector models.'),
(16700, 76, '\n\n1985: David Blair and Bill Maron publish: An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System mid-1980s: Efforts to develop end-user versions of commercial IR systems.'),
(16701, 76, '\n\n19851993: Key papers on and experimental systems for visualization interfaces.'),
(16702, 76, '\n\nWork by Donald B. Crouch, Robert R. Korfhage, Matthew Chalmers, Anselm Spoerri and others.'),
(16703, 76, '\n\n1989: First World Wide Web proposals by Tim Berners-Lee at CERN.'),
(16704, 76, '\n\n1990s 1992: First TREC conference.'),
(16705, 76, '\n\n1997: Publication of Korfhage's Information Storage and Retrieval[15] with emphasis on visualization and multi-reference point systems.'),
(16706, 76, '\n\nlate 1990s: Web search engines implementation of many features formerly found only in experimental IR systems.'),
(16707, 76, 'Search engines become the most common and maybe best instantiation of IR models, research, and implement by Jocanz.'),
(16708, 76, '\n\nAwards in the field[edit] Tony Kent Strix award Gerard Salton Award See also[edit] Adversarial information retrieval Collaborative information seeking Controlled vocabulary Cross-language information retrieval Data mining European Summer School in Information Retrieval Humancomputer information retrieval Information extraction Information Retrieval Facility Knowledge visualization Multimedia Information Retrieval List of information retrieval libraries Personal information management Relevance (Information Retrieval) Relevance feedback Rocchio Classification Search index Social information seeking Special Interest Group on Information Retrieval Structured Search Subject indexing Temporal information retrieval Tf-idf XML-Retrieval Key-objects References[edit] Wikiquote has a collection of quotations related to: Information retrieval ACM SIGIR: Information Retrieval Special Interest Group BCS IRSG: British Computer Society - Information Retrieval Specialist Group Text Retrieval Conference (TREC) Forum for Information Retrieval Evaluation (FIRE) Information Retrieval (online book) by C. J. van Rijsbergen Information Retrieval Wiki Information Retrieval Facility Information Retrieval @ DUTH Introduction to Information Retrieval (online book) by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schtze, Cambridge University Press.'),
(16709, 76, '2008.'),
(16710, 76, '\n\nCategories: Information retrievalNatural language processing Navigation menu Create accountLog inArticleTalkReadEditView history Edit links This page was last modified on 17 February 2014 at 05:19.'),
(16711, 76, '\n\nText is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply.'),
(16712, 76, 'By using this site, you agree to the Terms of Use and Privacy Policy.'),
(16713, 76, '\n\nWikipedia is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.'),
(16714, 76, '\n\nPrivacy policyAbout WikipediaDisclaimersContact WikipediaDevelopersMobile viewWikimedia Foundation Powered by MediaWiki'),
(16715, 78, 'Information retrieval is the activity of obtaining information resources relevant to an information need from a collection of information resources.'),
(16716, 78, 'Searches can be based on metadata or on full-text (or other content-based) indexing.'),
(16717, 78, '\n\nAutomated information retrieval systems are used to reduce what has been called "information overload".'),
(16718, 78, 'Many universities and public libraries use IR systems to provide access to books, journals and other documents.'),
(16719, 78, 'Web search engines are the most visible IR applications.'),
(16720, 78, '\n\nContents [hide] 1 Overview 2 History 3 Model types 3.1 First dimension: mathematical basis 3.2 Second dimension: properties of the model 4 Performance and correctness measures 4.1 Precision 4.2 Recall 4.3 Fall-out 4.4 F-measure 4.5 Average precision 4.6 R-Precision 4.7 Mean average precision 4.8 Discounted cumulative gain 4.9 Other Measures 4.10 Timeline 5 Awards in the field 6 See also 7 References 8 External links Overview[edit] An information retrieval process begins when a user enters a query into the system.'),
(16721, 78, 'Queries are formal statements of information needs, for example search strings in web search engines.'),
(16722, 78, 'In information retrieval a query does not uniquely identify a single object in the collection.'),
(16723, 78, 'Instead, several objects may match the query, perhaps with different degrees of relevancy.'),
(16724, 78, '\n\nAn object is an entity that is represented by information in a database.'),
(16725, 78, 'User queries are matched against the database information.'),
(16726, 78, 'Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos.'),
(16727, 78, 'Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.'),
(16728, 78, '\n\nMost IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value.'),
(16729, 78, 'The top ranking objects are then shown to the user.'),
(16730, 78, 'The process may then be iterated if the user wishes to refine the query.'),
(16731, 78, '[4] History[edit] But do you know that, although I have kept the diary [on a phonograph] for months past, it never once struck me how I was going to find any particular part of it in case I wanted to look it up?'),
(16732, 78, '\n\nDr Seward, Bram Stoker's Dracula, 1897 The idea of using computers to search for relevant pieces of information was popularized in the article As We May Think by Vannevar Bush in 1945.'),
(16733, 78, '[5] The first automated information retrieval systems were introduced in the 1950s and 1960s.'),
(16734, 78, 'By 1970 several different techniques had been shown to perform well on small text corpora such as the Cranfield collection (several thousand documents).'),
(16735, 78, '[5] Large-scale retrieval systems, such as the Lockheed Dialog system, came into use early in the 1970s.'),
(16736, 78, '\n\nIn 1992, the US Department of Defense along with the National Institute of Standards and Technology (NIST), cosponsored the Text Retrieval Conference (TREC) as part of the TIPSTER text program.'),
(16737, 78, 'The aim of this was to look into the information retrieval community by supplying the infrastructure that was needed for evaluation of text retrieval methodologies on a very large text collection.'),
(16738, 78, 'This catalyzed research on methods that scale to huge corpora.'),
(16739, 78, 'The introduction of web search engines has boosted the need for very large scale retrieval systems even further.'),
(16740, 78, '\n\nModel types[edit] Categorization of IR-models (translated from German entry, original source Dominik Kuropka).'),
(16741, 78, '\n\nFor effectively retrieving relevant documents by IR strategies, the documents are typically transformed into a suitable representation.'),
(16742, 78, 'Each retrieval strategy incorporate a specific model for its document representation purposes.'),
(16743, 78, 'The picture on the right illustrates the relationship of some common models.'),
(16744, 78, 'In the picture, the models are categorized according to two dimensions: the mathematical basis and the properties of the model.'),
(16745, 78, '\n\nFirst dimension: mathematical basis[edit] Set-theoretic models represent documents as sets of words or phrases.'),
(16746, 78, 'Similarities are usually derived from set-theoretic operations on those sets.'),
(16747, 78, 'Common models are: Standard Boolean model Extended Boolean model Fuzzy retrieval Algebraic models represent documents and queries usually as vectors, matrices, or tuples.'),
(16748, 78, 'The similarity of the query vector and document vector is represented as a scalar value.'),
(16749, 78, '\n\nVector space model Generalized vector space model (Enhanced) Topic-based Vector Space Model Extended Boolean model Latent semantic indexing aka latent semantic analysis Probabilistic models treat the process of document retrieval as a probabilistic inference.'),
(16750, 78, 'Similarities are computed as probabilities that a document is relevant for a given query.'),
(16751, 78, 'Probabilistic theorems like the Bayes' theorem are often used in these models.'),
(16752, 78, '\n\nBinary Independence Model Probabilistic relevance model on which is based the okapi (BM25) relevance function Uncertain inference Language models Divergence-from-randomness model Latent Dirichlet allocation Feature-based retrieval models view documents as vectors of values of feature functions (or just features) and seek the best way to combine these features into a single relevance score, typically by learning to rank methods.'),
(16753, 78, 'Feature functions are arbitrary functions of document and query, and as such can easily incorporate almost any other retrieval model as just a yet another feature.'),
(16754, 78, '\n\nSecond dimension: properties of the model[edit] Models without term-interdependencies treat different terms/words as independent.'),
(16755, 78, 'This fact is usually represented in vector space models by the orthogonality assumption of term vectors or in probabilistic models by an independency assumption for term variables.'),
(16756, 78, '\n\nModels with immanent term interdependencies allow a representation of interdependencies between terms.'),
(16757, 78, 'However the degree of the interdependency between two terms is defined by the model itself.'),
(16758, 78, 'It is usually directly or indirectly derived (e.g. by dimensional reduction) from the co-occurrence of those terms in the whole set of documents.'),
(16759, 78, '\n\nModels with transcendent term interdependencies allow a representation of interdependencies between terms, but they do not allege how the interdependency between two terms is defined.'),
(16760, 78, 'They relay an external source for the degree of interdependency between two terms.'),
(16761, 78, '(For example a human or sophisticated algorithms.)'),
(16762, 78, '\n\nPerformance and correctness measures[edit] Main article: Precision and recall Many different measures for evaluating the performance of information retrieval systems have been proposed.'),
(16763, 78, 'The measures require a collection of documents and a query.'),
(16764, 78, 'All common measures described here assume a ground truth notion of relevancy: every document is known to be either relevant or non-relevant to a particular query.'),
(16765, 78, 'In practice queries may be ill-posed and there may be different shades of relevancy.'),
(16766, 78, '\n\nPrecision[edit] Precision is the fraction of the documents retrieved that are relevant to the user's information need.'),
(16767, 78, '\n\n \\mbox{precision}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{retrieved documents}\\}|} In binary classification, precision is analogous to positive predictive value.'),
(16768, 78, 'Precision takes all retrieved documents into account.'),
(16769, 78, 'It can also be evaluated at a given cut-off rank, considering only the topmost results returned by the system.'),
(16770, 78, 'This measure is called precision at n or P@n.'),
(16771, 78, '\n\nNote that the meaning and usage of "precision" in the field of Information Retrieval differs from the definition of accuracy and precision within other branches of science and statistics.'),
(16772, 78, '\n\nRecall[edit] Recall is the fraction of the documents that are relevant to the query that are successfully retrieved.'),
(16773, 78, '\n\n\\mbox{recall}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{relevant documents}\\}|} In binary classification, recall is often called sensitivity.'),
(16774, 78, 'So it can be looked at as the probability that a relevant document is retrieved by the query.'),
(16775, 78, '\n\nIt is trivial to achieve recall of 100% by returning all documents in response to any query.'),
(16776, 78, 'Therefore recall alone is not enough but one needs to measure the number of non-relevant documents also, for example by computing the precision.'),
(16777, 78, '\n\nFall-out[edit] The proportion of non-relevant documents that are retrieved, out of all non-relevant documents available: \\mbox{fall-out}=\\frac{|\\{\\mbox{non-relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{non-relevant documents}\\}|} In binary classification, fall-out is closely related to specificity and is equal to (1-\\mbox{specificity}).'),
(16778, 78, 'It can be looked at as the probability that a non-relevant document is retrieved by the query.'),
(16779, 78, '\n\nIt is trivial to achieve fall-out of 0% by returning zero documents in response to any query.'),
(16780, 78, '\n\nF-measure[edit] Main article: F-score The weighted harmonic mean of precision and recall, the traditional F-measure or balanced F-score is: F = \\frac{2 \\cdot \\mathrm{precision} \\cdot \\mathrm{recall}}{(\\mathrm{precision} + \\mathrm{recall})}.'),
(16781, 78, '\\, This is also known as the F_1 measure, because recall and precision are evenly weighted.'),
(16782, 78, '\n\nThe general formula for non-negative real \\beta is: F_\\beta = \\frac{(1 + \\beta^2) \\cdot (\\mathrm{precision} \\cdot \\mathrm{recall})}{(\\beta^2 \\cdot \\mathrm{precision} + \\mathrm{recall})}\\,.'),
(16783, 78, '\n\nTwo other commonly used F measures are the F_{2} measure, which weights recall twice as much as precision, and the F_{0.5} measure, which weights precision twice as much as recall.'),
(16784, 78, '\n\nThe F-measure was derived by van Rijsbergen (1979) so that F_\\beta "measures the effectiveness of retrieval with respect to a user who attaches \\beta times as much importance to recall as precision".'),
(16785, 78, 'It is based on van Rijsbergen's effectiveness measure E = 1 - \\frac{1}{\\frac{\\alpha}{P} + \\frac{1-\\alpha}{R}}.'),
(16786, 78, 'Their relationship is F_\\beta = 1 - E where \\alpha=\\frac{1}{1 + \\beta^2}.'),
(16787, 78, '\n\nAverage precision[edit] Precision and recall are single-value metrics based on the whole list of documents returned by the system.'),
(16788, 78, 'For systems that return a ranked sequence of documents, it is desirable to also consider the order in which the returned documents are presented.'),
(16789, 78, 'By computing a precision and recall at every position in the ranked sequence of documents, one can plot a precision-recall curve, plotting precision p(r) as a function of recall r. Average precision computes the average value of p(r) over the interval from r=0 to r=1:[6] \\operatorname{AveP} = \\int_0^1 p(r)dr That is the area under the precision-recall curve.'),
(16790, 78, 'This integral is in practice replaced with a finite sum over every position in the ranked sequence of documents: \\operatorname{AveP} = \\sum_{k=1}^n P(k) \\Delta r(k) where k is the rank in the sequence of retrieved documents, n is the number of retrieved documents, P(k) is the precision at cut-off k in the list, and \\Delta r(k) is the change in recall from items k-1 to k.[6] This finite sum is equivalent to: \\operatorname{AveP} = \\frac{\\sum_{k=1}^n (P(k) \\times \\operatorname{rel}(k))}{\\mbox{number of relevant documents}} \\!'),
(16791, 78, '\n\nwhere \\operatorname{rel}(k) is an indicator function equaling 1 if the item at rank k is a relevant document, zero otherwise.'),
(16792, 78, '[7] Note that the average is over all relevant documents and the relevant documents not retrieved get a precision score of zero.'),
(16793, 78, '\n\nSome authors choose to interpolate the p(r) function to reduce the impact of "wiggles" in the curve.'),
(16794, 78, '[8][9] For example, the PASCAL Visual Object Classes challenge (a benchmark for computer vision object detection) computes average precision by averaging the precision over a set of evenly spaced recall levels {0, 0.1, 0.2, ... 1.0}:[8][9] \\operatorname{AveP} = \\frac{1}{11} \\sum_{r \\in \\{0, 0.1, \\ldots, 1.0\\}} p_{\\operatorname{interp}}(r) where p_{\\operatorname{interp}}(r) is an interpolated precision that takes the maximum precision over all recalls greater than r: p_{\\operatorname{interp}}(r) = \\operatorname{max}_{\\tilde{r}:\\tilde{r} \\geq r} p(\\tilde{r}).'),
(16795, 78, '\n\nAn alternative is to derive an analytical p(r) function by assuming a particular parametric distribution for the underlying decision values.'),
(16796, 78, 'For example, a binormal precision-recall curve can be obtained by assuming decision values in both classes to follow a Gaussian distribution.'),
(16797, 78, '[10] R-Precision[edit] Precision at R-th position in the ranking of results for a query that has R relevant documents.'),
(16798, 78, 'This measure is highly correlated to Average Precision.'),
(16799, 78, 'Also, Precision is equal to Recall at the R-th position.'),
(16800, 78, '\n\nMean average precision[edit] Mean average precision for a set of queries is the mean of the average precision scores for each query.'),
(16801, 78, '\n\n \\operatorname{MAP} = \\frac{\\sum_{q=1}^Q \\operatorname{AveP(q)}}{Q} \\!'),
(16802, 78, '\n\nwhere Q is the number of queries.'),
(16803, 78, '\n\nDiscounted cumulative gain[edit] Main article: Discounted cumulative gain DCG uses a graded relevance scale of documents from the result set to evaluate the usefulness, or gain, of a document based on its position in the result list.'),
(16804, 78, 'The premise of DCG is that highly relevant documents appearing lower in a search result list should be penalized as the graded relevance value is reduced logarithmically proportional to the position of the result.'),
(16805, 78, '\n\nThe DCG accumulated at a particular rank position p is defined as: \\mathrm{DCG_{p}} = rel_{1} + \\sum_{i=2}^{p} \\frac{rel_{i}}{\\log_{2}i}.'),
(16806, 78, '\n\nSince result set may vary in size among different queries or systems, to compare performances the normalised version of DCG uses an ideal DCG.'),
(16807, 78, 'To this end, it sorts documents of a result list by relevance, producing an ideal DCG at position p (IDCG_p), which normalizes the score: \\mathrm{nDCG_{p}} = \\frac{DCG_{p}}{IDCG{p}}.'),
(16808, 78, '\n\nThe nDCG values for all queries can be averaged to obtain a measure of the average performance of a ranking algorithm.'),
(16809, 78, 'Note that in a perfect ranking algorithm, the DCG_p will be the same as the IDCG_p producing an nDCG of 1.0.'),
(16810, 78, 'All nDCG calculations are then relative values on the interval 0.0 to 1.0 and so are cross-query comparable.'),
(16811, 78, '\n\nOther Measures[edit] Mean reciprocal rank Spearman's rank correlation coefficient Timeline[edit] Before the 1900s 1801: Joseph Marie Jacquard invents the Jacquard loom, the first machine to use punched cards to control a sequence of operations.'),
(16812, 78, '\n\n1880s: Herman Hollerith invents an electro-mechanical data tabulator using punch cards as a machine readable medium.'),
(16813, 78, '\n\n1890 Hollerith cards, keypunches and tabulators used to process the 1890 US Census data.'),
(16814, 78, '\n\n1920s-1930s Emanuel Goldberg submits patents for his "Statistical Machine a document search engine that used photoelectric cells and pattern recognition to search the metadata on rolls of microfilmed documents.'),
(16815, 78, '\n\n1940s1950s late 1940s: The US military confronted problems of indexing and retrieval of wartime scientific research documents captured from Germans.'),
(16816, 78, '\n\n1945: Vannevar Bush's As We May Think appeared in Atlantic Monthly.');
INSERT INTO `sentences` (`id`, `document_id`, `sentence`) VALUES
(16817, 78, '\n\n1947: Hans Peter Luhn (research engineer at IBM since 1941) began work on a mechanized punch card-based system for searching chemical compounds.'),
(16818, 78, '\n\n1950s: Growing concern in the US for a "science gap" with the USSR motivated, encouraged funding and provided a backdrop for mechanized literature searching systems (Allen Kent et al.) and the invention of citation indexing (Eugene Garfield).'),
(16819, 78, '\n\n1950: The term "information retrieval" appears to have been coined by Calvin Mooers.'),
(16820, 78, '[11] 1951: Philip Bagley conducted the earliest experiment in computerized document retrieval in a master thesis at MIT.'),
(16821, 78, '[12] 1955: Allen Kent joined Case Western Reserve University, and eventually became associate director of the Center for Documentation and Communications Research.'),
(16822, 78, 'That same year, Kent and colleagues published a paper in American Documentation describing the precision and recall measures as well as detailing a proposed "framework" for evaluating an IR system which included statistical sampling methods for determining the number of relevant documents not retrieved.'),
(16823, 78, '\n\n1958: International Conference on Scientific Information Washington DC included consideration of IR systems as a solution to problems identified.'),
(16824, 78, 'See: Proceedings of the International Conference on Scientific Information, 1958 (National Academy of Sciences, Washington, DC, 1959) 1959: Hans Peter Luhn published "Auto-encoding of documents for information retrieval.'),
(16825, 78, '" 1960s: early 1960s: Gerard Salton began work on IR at Harvard, later moved to Cornell.'),
(16826, 78, '\n\n1960: Melvin Earl Maron and John Lary Kuhns[13] published "On relevance, probabilistic indexing, and information retrieval" in the Journal of the ACM 7(3):216244, July 1960.'),
(16827, 78, '\n\n1962: Cyril W. Cleverdon published early findings of the Cranfield studies, developing a model for IR system evaluation.'),
(16828, 78, 'See: Cyril W. Cleverdon, "Report on the Testing and Analysis of an Investigation into the Comparative Efficiency of Indexing Systems".'),
(16829, 78, 'Cranfield Collection of Aeronautics, Cranfield, England, 1962.'),
(16830, 78, '\n\nKent published Information Analysis and Retrieval.'),
(16831, 78, '\n\n1963: Weinberg report "Science, Government and Information" gave a full articulation of the idea of a "crisis of scientific information.'),
(16832, 78, '" The report was named after Dr. Alvin Weinberg.'),
(16833, 78, '\n\nJoseph Becker and Robert M. Hayes published text on information retrieval.'),
(16834, 78, 'Becker, Joseph; Hayes, Robert Mayo.'),
(16835, 78, 'Information storage and retrieval: tools, elements, theories.'),
(16836, 78, 'New York, Wiley (1963).'),
(16837, 78, '\n\n1964: Karen Sprck Jones finished her thesis at Cambridge, Synonymy and Semantic Classification, and continued work on computational linguistics as it applies to IR.'),
(16838, 78, '\n\nThe National Bureau of Standards sponsored a symposium titled "Statistical Association Methods for Mechanized Documentation.'),
(16839, 78, '" Several highly significant papers, including G. Salton's first published reference (we believe) to the SMART system.'),
(16840, 78, '\n\nmid-1960s: National Library of Medicine developed MEDLARS Medical Literature Analysis and Retrieval System, the first major machine-readable database and batch-retrieval system.'),
(16841, 78, '\n\nProject Intrex at MIT.'),
(16842, 78, '\n\n1965: J. C. R. Licklider published Libraries of the Future.'),
(16843, 78, '\n\n1966: Don Swanson was involved in studies at University of Chicago on Requirements for Future Catalogs.'),
(16844, 78, '\n\nlate 1960s: F. Wilfrid Lancaster completed evaluation studies of the MEDLARS system and published the first edition of his text on information retrieval.'),
(16845, 78, '\n\n1968: Gerard Salton published Automatic Information Organization and Retrieval.'),
(16846, 78, '\n\nJohn W. Sammon, Jr.'s RADC Tech report "Some Mathematics of Information Storage and Retrieval..." outlined the vector model.'),
(16847, 78, '\n\n1969: Sammon's "A nonlinear mapping for data structure analysis" (IEEE Transactions on Computers) was the first proposal for visualization interface to an IR system.'),
(16848, 78, '\n\n1970s early 1970s: First online systemsNLM's AIM-TWX, MEDLINE; Lockheed's Dialog; SDC's ORBIT.'),
(16849, 78, '\n\nTheodor Nelson promoting concept of hypertext, published Computer Lib/Dream Machines.'),
(16850, 78, '\n\n1971: Nicholas Jardine and Cornelis J. van Rijsbergen published "The use of hierarchic clustering in information retrieval", which articulated the "cluster hypothesis.'),
(16851, 78, '"[14] 1975: Three highly influential publications by Salton fully articulated his vector processing framework and term discrimination model: A Theory of Indexing (Society for Industrial and Applied Mathematics) A Theory of Term Importance in Automatic Text Analysis (JASIS v. 26) A Vector Space Model for Automatic Indexing (CACM 18:11) 1978: The First ACM SIGIR conference.'),
(16852, 78, '\n\n1979: C. J. van Rijsbergen published Information Retrieval (Butterworths).'),
(16853, 78, 'Heavy emphasis on probabilistic models.'),
(16854, 78, '\n\n1980s 1980: First international ACM SIGIR conference, joint with British Computer Society IR group in Cambridge.'),
(16855, 78, '\n\n1982: Nicholas J. Belkin, Robert N. Oddy, and Helen M. Brooks proposed the ASK (Anomalous State of Knowledge) viewpoint for information retrieval.'),
(16856, 78, 'This was an important concept, though their automated analysis tool proved ultimately disappointing.'),
(16857, 78, '\n\n1983: Salton (and Michael J. McGill) published Introduction to Modern Information Retrieval (McGraw-Hill), with heavy emphasis on vector models.'),
(16858, 78, '\n\n1985: David Blair and Bill Maron publish: An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System mid-1980s: Efforts to develop end-user versions of commercial IR systems.'),
(16859, 78, '\n\n19851993: Key papers on and experimental systems for visualization interfaces.'),
(16860, 78, '\n\nWork by Donald B. Crouch, Robert R. Korfhage, Matthew Chalmers, Anselm Spoerri and others.'),
(16861, 78, '\n\n1989: First World Wide Web proposals by Tim Berners-Lee at CERN.'),
(16862, 78, '\n\n1990s 1992: First TREC conference.'),
(16863, 78, '\n\n1997: Publication of Korfhage's Information Storage and Retrieval[15] with emphasis on visualization and multi-reference point systems.'),
(16864, 78, '\n\nlate 1990s: Web search engines implementation of many features formerly found only in experimental IR systems.'),
(16865, 78, 'Search engines become the most common and maybe best instantiation of IR models, research, and implement by Jocanz.'),
(16866, 78, '\n\nAwards in the field[edit] Tony Kent Strix award Gerard Salton Award See also[edit] Adversarial information retrieval Collaborative information seeking Controlled vocabulary Cross-language information retrieval Data mining European Summer School in Information Retrieval Humancomputer information retrieval Information extraction Information Retrieval Facility Knowledge visualization Multimedia Information Retrieval List of information retrieval libraries Personal information management Relevance (Information Retrieval) Relevance feedback Rocchio Classification Search index Social information seeking Special Interest Group on Information Retrieval Structured Search Subject indexing Temporal information retrieval Tf-idf XML-Retrieval Key-objects References[edit] Wikiquote has a collection of quotations related to: Information retrieval ACM SIGIR: Information Retrieval Special Interest Group BCS IRSG: British Computer Society - Information Retrieval Specialist Group Text Retrieval Conference (TREC) Forum for Information Retrieval Evaluation (FIRE) Information Retrieval (online book) by C. J. van Rijsbergen Information Retrieval Wiki Information Retrieval Facility Information Retrieval @ DUTH Introduction to Information Retrieval (online book) by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schtze, Cambridge University Press.'),
(16867, 78, '2008.'),
(16868, 78, '\n\nCategories: Information retrievalNatural language processing Navigation menu Create accountLog inArticleTalkReadEditView history Edit links This page was last modified on 17 February 2014 at 05:19.'),
(16869, 78, '\n\nText is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply.'),
(16870, 78, 'By using this site, you agree to the Terms of Use and Privacy Policy.'),
(16871, 78, '\n\nWikipedia is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.'),
(16872, 78, '\n\nPrivacy policyAbout WikipediaDisclaimersContact WikipediaDevelopersMobile viewWikimedia Foundation Powered by MediaWiki'),
(16873, 79, 'Information retrieval is the activity of obtaining information resources relevant to an information need from a collection of information resources.'),
(16874, 79, 'Searches can be based on metadata or on full-text (or other content-based) indexing.'),
(16875, 79, '\n\nAutomated information retrieval systems are used to reduce what has been called "information overload".'),
(16876, 79, 'Many universities and public libraries use IR systems to provide access to books, journals and other documents.'),
(16877, 79, 'Web search engines are the most visible IR applications.'),
(16878, 79, '\n\nContents [hide] 1 Overview 2 History 3 Model types 3.1 First dimension: mathematical basis 3.2 Second dimension: properties of the model 4 Performance and correctness measures 4.1 Precision 4.2 Recall 4.3 Fall-out 4.4 F-measure 4.5 Average precision 4.6 R-Precision 4.7 Mean average precision 4.8 Discounted cumulative gain 4.9 Other Measures 4.10 Timeline 5 Awards in the field 6 See also 7 References 8 External links Overview[edit] An information retrieval process begins when a user enters a query into the system.'),
(16879, 79, 'Queries are formal statements of information needs, for example search strings in web search engines.'),
(16880, 79, 'In information retrieval a query does not uniquely identify a single object in the collection.'),
(16881, 79, 'Instead, several objects may match the query, perhaps with different degrees of relevancy.'),
(16882, 79, '\n\nAn object is an entity that is represented by information in a database.'),
(16883, 79, 'User queries are matched against the database information.'),
(16884, 79, 'Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos.'),
(16885, 79, 'Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.'),
(16886, 79, '\n\nMost IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value.'),
(16887, 79, 'The top ranking objects are then shown to the user.'),
(16888, 79, 'The process may then be iterated if the user wishes to refine the query.'),
(16889, 79, '[4] History[edit] But do you know that, although I have kept the diary [on a phonograph] for months past, it never once struck me how I was going to find any particular part of it in case I wanted to look it up?'),
(16890, 79, '\n\nDr Seward, Bram Stoker's Dracula, 1897 The idea of using computers to search for relevant pieces of information was popularized in the article As We May Think by Vannevar Bush in 1945.'),
(16891, 79, '[5] The first automated information retrieval systems were introduced in the 1950s and 1960s.'),
(16892, 79, 'By 1970 several different techniques had been shown to perform well on small text corpora such as the Cranfield collection (several thousand documents).'),
(16893, 79, '[5] Large-scale retrieval systems, such as the Lockheed Dialog system, came into use early in the 1970s.'),
(16894, 79, '\n\nIn 1992, the US Department of Defense along with the National Institute of Standards and Technology (NIST), cosponsored the Text Retrieval Conference (TREC) as part of the TIPSTER text program.'),
(16895, 79, 'The aim of this was to look into the information retrieval community by supplying the infrastructure that was needed for evaluation of text retrieval methodologies on a very large text collection.'),
(16896, 79, 'This catalyzed research on methods that scale to huge corpora.'),
(16897, 79, 'The introduction of web search engines has boosted the need for very large scale retrieval systems even further.'),
(16898, 79, '\n\nModel types[edit] Categorization of IR-models (translated from German entry, original source Dominik Kuropka).'),
(16899, 79, '\n\nFor effectively retrieving relevant documents by IR strategies, the documents are typically transformed into a suitable representation.'),
(16900, 79, 'Each retrieval strategy incorporate a specific model for its document representation purposes.'),
(16901, 79, 'The picture on the right illustrates the relationship of some common models.'),
(16902, 79, 'In the picture, the models are categorized according to two dimensions: the mathematical basis and the properties of the model.'),
(16903, 79, '\n\nFirst dimension: mathematical basis[edit] Set-theoretic models represent documents as sets of words or phrases.'),
(16904, 79, 'Similarities are usually derived from set-theoretic operations on those sets.'),
(16905, 79, 'Common models are: Standard Boolean model Extended Boolean model Fuzzy retrieval Algebraic models represent documents and queries usually as vectors, matrices, or tuples.'),
(16906, 79, 'The similarity of the query vector and document vector is represented as a scalar value.'),
(16907, 79, '\n\nVector space model Generalized vector space model (Enhanced) Topic-based Vector Space Model Extended Boolean model Latent semantic indexing aka latent semantic analysis Probabilistic models treat the process of document retrieval as a probabilistic inference.'),
(16908, 79, 'Similarities are computed as probabilities that a document is relevant for a given query.'),
(16909, 79, 'Probabilistic theorems like the Bayes' theorem are often used in these models.'),
(16910, 79, '\n\nBinary Independence Model Probabilistic relevance model on which is based the okapi (BM25) relevance function Uncertain inference Language models Divergence-from-randomness model Latent Dirichlet allocation Feature-based retrieval models view documents as vectors of values of feature functions (or just features) and seek the best way to combine these features into a single relevance score, typically by learning to rank methods.'),
(16911, 79, 'Feature functions are arbitrary functions of document and query, and as such can easily incorporate almost any other retrieval model as just a yet another feature.'),
(16912, 79, '\n\nSecond dimension: properties of the model[edit] Models without term-interdependencies treat different terms/words as independent.'),
(16913, 79, 'This fact is usually represented in vector space models by the orthogonality assumption of term vectors or in probabilistic models by an independency assumption for term variables.'),
(16914, 79, '\n\nModels with immanent term interdependencies allow a representation of interdependencies between terms.'),
(16915, 79, 'However the degree of the interdependency between two terms is defined by the model itself.'),
(16916, 79, 'It is usually directly or indirectly derived (e.g. by dimensional reduction) from the co-occurrence of those terms in the whole set of documents.'),
(16917, 79, '\n\nModels with transcendent term interdependencies allow a representation of interdependencies between terms, but they do not allege how the interdependency between two terms is defined.'),
(16918, 79, 'They relay an external source for the degree of interdependency between two terms.'),
(16919, 79, '(For example a human or sophisticated algorithms.)'),
(16920, 79, '\n\nPerformance and correctness measures[edit] Main article: Precision and recall Many different measures for evaluating the performance of information retrieval systems have been proposed.'),
(16921, 79, 'The measures require a collection of documents and a query.'),
(16922, 79, 'All common measures described here assume a ground truth notion of relevancy: every document is known to be either relevant or non-relevant to a particular query.'),
(16923, 79, 'In practice queries may be ill-posed and there may be different shades of relevancy.'),
(16924, 79, '\n\nPrecision[edit] Precision is the fraction of the documents retrieved that are relevant to the user's information need.'),
(16925, 79, '\n\n \\mbox{precision}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{retrieved documents}\\}|} In binary classification, precision is analogous to positive predictive value.'),
(16926, 79, 'Precision takes all retrieved documents into account.'),
(16927, 79, 'It can also be evaluated at a given cut-off rank, considering only the topmost results returned by the system.'),
(16928, 79, 'This measure is called precision at n or P@n.'),
(16929, 79, '\n\nNote that the meaning and usage of "precision" in the field of Information Retrieval differs from the definition of accuracy and precision within other branches of science and statistics.'),
(16930, 79, '\n\nRecall[edit] Recall is the fraction of the documents that are relevant to the query that are successfully retrieved.'),
(16931, 79, '\n\n\\mbox{recall}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{relevant documents}\\}|} In binary classification, recall is often called sensitivity.'),
(16932, 79, 'So it can be looked at as the probability that a relevant document is retrieved by the query.'),
(16933, 79, '\n\nIt is trivial to achieve recall of 100% by returning all documents in response to any query.'),
(16934, 79, 'Therefore recall alone is not enough but one needs to measure the number of non-relevant documents also, for example by computing the precision.'),
(16935, 79, '\n\nFall-out[edit] The proportion of non-relevant documents that are retrieved, out of all non-relevant documents available: \\mbox{fall-out}=\\frac{|\\{\\mbox{non-relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{non-relevant documents}\\}|} In binary classification, fall-out is closely related to specificity and is equal to (1-\\mbox{specificity}).'),
(16936, 79, 'It can be looked at as the probability that a non-relevant document is retrieved by the query.'),
(16937, 79, '\n\nIt is trivial to achieve fall-out of 0% by returning zero documents in response to any query.'),
(16938, 79, '\n\nF-measure[edit] Main article: F-score The weighted harmonic mean of precision and recall, the traditional F-measure or balanced F-score is: F = \\frac{2 \\cdot \\mathrm{precision} \\cdot \\mathrm{recall}}{(\\mathrm{precision} + \\mathrm{recall})}.'),
(16939, 79, '\\, This is also known as the F_1 measure, because recall and precision are evenly weighted.'),
(16940, 79, '\n\nThe general formula for non-negative real \\beta is: F_\\beta = \\frac{(1 + \\beta^2) \\cdot (\\mathrm{precision} \\cdot \\mathrm{recall})}{(\\beta^2 \\cdot \\mathrm{precision} + \\mathrm{recall})}\\,.'),
(16941, 79, '\n\nTwo other commonly used F measures are the F_{2} measure, which weights recall twice as much as precision, and the F_{0.5} measure, which weights precision twice as much as recall.'),
(16942, 79, '\n\nThe F-measure was derived by van Rijsbergen (1979) so that F_\\beta "measures the effectiveness of retrieval with respect to a user who attaches \\beta times as much importance to recall as precision".'),
(16943, 79, 'It is based on van Rijsbergen's effectiveness measure E = 1 - \\frac{1}{\\frac{\\alpha}{P} + \\frac{1-\\alpha}{R}}.'),
(16944, 79, 'Their relationship is F_\\beta = 1 - E where \\alpha=\\frac{1}{1 + \\beta^2}.'),
(16945, 79, '\n\nAverage precision[edit] Precision and recall are single-value metrics based on the whole list of documents returned by the system.'),
(16946, 79, 'For systems that return a ranked sequence of documents, it is desirable to also consider the order in which the returned documents are presented.'),
(16947, 79, 'By computing a precision and recall at every position in the ranked sequence of documents, one can plot a precision-recall curve, plotting precision p(r) as a function of recall r. Average precision computes the average value of p(r) over the interval from r=0 to r=1:[6] \\operatorname{AveP} = \\int_0^1 p(r)dr That is the area under the precision-recall curve.'),
(16948, 79, 'This integral is in practice replaced with a finite sum over every position in the ranked sequence of documents: \\operatorname{AveP} = \\sum_{k=1}^n P(k) \\Delta r(k) where k is the rank in the sequence of retrieved documents, n is the number of retrieved documents, P(k) is the precision at cut-off k in the list, and \\Delta r(k) is the change in recall from items k-1 to k.[6] This finite sum is equivalent to: \\operatorname{AveP} = \\frac{\\sum_{k=1}^n (P(k) \\times \\operatorname{rel}(k))}{\\mbox{number of relevant documents}} \\!'),
(16949, 79, '\n\nwhere \\operatorname{rel}(k) is an indicator function equaling 1 if the item at rank k is a relevant document, zero otherwise.'),
(16950, 79, '[7] Note that the average is over all relevant documents and the relevant documents not retrieved get a precision score of zero.'),
(16951, 79, '\n\nSome authors choose to interpolate the p(r) function to reduce the impact of "wiggles" in the curve.'),
(16952, 79, '[8][9] For example, the PASCAL Visual Object Classes challenge (a benchmark for computer vision object detection) computes average precision by averaging the precision over a set of evenly spaced recall levels {0, 0.1, 0.2, ... 1.0}:[8][9] \\operatorname{AveP} = \\frac{1}{11} \\sum_{r \\in \\{0, 0.1, \\ldots, 1.0\\}} p_{\\operatorname{interp}}(r) where p_{\\operatorname{interp}}(r) is an interpolated precision that takes the maximum precision over all recalls greater than r: p_{\\operatorname{interp}}(r) = \\operatorname{max}_{\\tilde{r}:\\tilde{r} \\geq r} p(\\tilde{r}).'),
(16953, 79, '\n\nAn alternative is to derive an analytical p(r) function by assuming a particular parametric distribution for the underlying decision values.'),
(16954, 79, 'For example, a binormal precision-recall curve can be obtained by assuming decision values in both classes to follow a Gaussian distribution.'),
(16955, 79, '[10] R-Precision[edit] Precision at R-th position in the ranking of results for a query that has R relevant documents.'),
(16956, 79, 'This measure is highly correlated to Average Precision.'),
(16957, 79, 'Also, Precision is equal to Recall at the R-th position.'),
(16958, 79, '\n\nMean average precision[edit] Mean average precision for a set of queries is the mean of the average precision scores for each query.'),
(16959, 79, '\n\n \\operatorname{MAP} = \\frac{\\sum_{q=1}^Q \\operatorname{AveP(q)}}{Q} \\!'),
(16960, 79, '\n\nwhere Q is the number of queries.'),
(16961, 79, '\n\nDiscounted cumulative gain[edit] Main article: Discounted cumulative gain DCG uses a graded relevance scale of documents from the result set to evaluate the usefulness, or gain, of a document based on its position in the result list.'),
(16962, 79, 'The premise of DCG is that highly relevant documents appearing lower in a search result list should be penalized as the graded relevance value is reduced logarithmically proportional to the position of the result.'),
(16963, 79, '\n\nThe DCG accumulated at a particular rank position p is defined as: \\mathrm{DCG_{p}} = rel_{1} + \\sum_{i=2}^{p} \\frac{rel_{i}}{\\log_{2}i}.'),
(16964, 79, '\n\nSince result set may vary in size among different queries or systems, to compare performances the normalised version of DCG uses an ideal DCG.'),
(16965, 79, 'To this end, it sorts documents of a result list by relevance, producing an ideal DCG at position p (IDCG_p), which normalizes the score: \\mathrm{nDCG_{p}} = \\frac{DCG_{p}}{IDCG{p}}.'),
(16966, 79, '\n\nThe nDCG values for all queries can be averaged to obtain a measure of the average performance of a ranking algorithm.'),
(16967, 79, 'Note that in a perfect ranking algorithm, the DCG_p will be the same as the IDCG_p producing an nDCG of 1.0.'),
(16968, 79, 'All nDCG calculations are then relative values on the interval 0.0 to 1.0 and so are cross-query comparable.'),
(16969, 79, '\n\nOther Measures[edit] Mean reciprocal rank Spearman's rank correlation coefficient Timeline[edit] Before the 1900s 1801: Joseph Marie Jacquard invents the Jacquard loom, the first machine to use punched cards to control a sequence of operations.'),
(16970, 79, '\n\n1880s: Herman Hollerith invents an electro-mechanical data tabulator using punch cards as a machine readable medium.'),
(16971, 79, '\n\n1890 Hollerith cards, keypunches and tabulators used to process the 1890 US Census data.'),
(16972, 79, '\n\n1920s-1930s Emanuel Goldberg submits patents for his "Statistical Machine a document search engine that used photoelectric cells and pattern recognition to search the metadata on rolls of microfilmed documents.'),
(16973, 79, '\n\n1940s1950s late 1940s: The US military confronted problems of indexing and retrieval of wartime scientific research documents captured from Germans.'),
(16974, 79, '\n\n1945: Vannevar Bush's As We May Think appeared in Atlantic Monthly.'),
(16975, 79, '\n\n1947: Hans Peter Luhn (research engineer at IBM since 1941) began work on a mechanized punch card-based system for searching chemical compounds.'),
(16976, 79, '\n\n1950s: Growing concern in the US for a "science gap" with the USSR motivated, encouraged funding and provided a backdrop for mechanized literature searching systems (Allen Kent et al.) and the invention of citation indexing (Eugene Garfield).'),
(16977, 79, '\n\n1950: The term "information retrieval" appears to have been coined by Calvin Mooers.'),
(16978, 79, '[11] 1951: Philip Bagley conducted the earliest experiment in computerized document retrieval in a master thesis at MIT.'),
(16979, 79, '[12] 1955: Allen Kent joined Case Western Reserve University, and eventually became associate director of the Center for Documentation and Communications Research.'),
(16980, 79, 'That same year, Kent and colleagues published a paper in American Documentation describing the precision and recall measures as well as detailing a proposed "framework" for evaluating an IR system which included statistical sampling methods for determining the number of relevant documents not retrieved.'),
(16981, 79, '\n\n1958: International Conference on Scientific Information Washington DC included consideration of IR systems as a solution to problems identified.'),
(16982, 79, 'See: Proceedings of the International Conference on Scientific Information, 1958 (National Academy of Sciences, Washington, DC, 1959) 1959: Hans Peter Luhn published "Auto-encoding of documents for information retrieval.'),
(16983, 79, '" 1960s: early 1960s: Gerard Salton began work on IR at Harvard, later moved to Cornell.'),
(16984, 79, '\n\n1960: Melvin Earl Maron and John Lary Kuhns[13] published "On relevance, probabilistic indexing, and information retrieval" in the Journal of the ACM 7(3):216244, July 1960.'),
(16985, 79, '\n\n1962: Cyril W. Cleverdon published early findings of the Cranfield studies, developing a model for IR system evaluation.'),
(16986, 79, 'See: Cyril W. Cleverdon, "Report on the Testing and Analysis of an Investigation into the Comparative Efficiency of Indexing Systems".'),
(16987, 79, 'Cranfield Collection of Aeronautics, Cranfield, England, 1962.'),
(16988, 79, '\n\nKent published Information Analysis and Retrieval.'),
(16989, 79, '\n\n1963: Weinberg report "Science, Government and Information" gave a full articulation of the idea of a "crisis of scientific information.'),
(16990, 79, '" The report was named after Dr. Alvin Weinberg.'),
(16991, 79, '\n\nJoseph Becker and Robert M. Hayes published text on information retrieval.'),
(16992, 79, 'Becker, Joseph; Hayes, Robert Mayo.'),
(16993, 79, 'Information storage and retrieval: tools, elements, theories.'),
(16994, 79, 'New York, Wiley (1963).'),
(16995, 79, '\n\n1964: Karen Sprck Jones finished her thesis at Cambridge, Synonymy and Semantic Classification, and continued work on computational linguistics as it applies to IR.'),
(16996, 79, '\n\nThe National Bureau of Standards sponsored a symposium titled "Statistical Association Methods for Mechanized Documentation.'),
(16997, 79, '" Several highly significant papers, including G. Salton's first published reference (we believe) to the SMART system.'),
(16998, 79, '\n\nmid-1960s: National Library of Medicine developed MEDLARS Medical Literature Analysis and Retrieval System, the first major machine-readable database and batch-retrieval system.'),
(16999, 79, '\n\nProject Intrex at MIT.'),
(17000, 79, '\n\n1965: J. C. R. Licklider published Libraries of the Future.'),
(17001, 79, '\n\n1966: Don Swanson was involved in studies at University of Chicago on Requirements for Future Catalogs.'),
(17002, 79, '\n\nlate 1960s: F. Wilfrid Lancaster completed evaluation studies of the MEDLARS system and published the first edition of his text on information retrieval.'),
(17003, 79, '\n\n1968: Gerard Salton published Automatic Information Organization and Retrieval.'),
(17004, 79, '\n\nJohn W. Sammon, Jr.'s RADC Tech report "Some Mathematics of Information Storage and Retrieval..." outlined the vector model.'),
(17005, 79, '\n\n1969: Sammon's "A nonlinear mapping for data structure analysis" (IEEE Transactions on Computers) was the first proposal for visualization interface to an IR system.'),
(17006, 79, '\n\n1970s early 1970s: First online systemsNLM's AIM-TWX, MEDLINE; Lockheed's Dialog; SDC's ORBIT.'),
(17007, 79, '\n\nTheodor Nelson promoting concept of hypertext, published Computer Lib/Dream Machines.'),
(17008, 79, '\n\n1971: Nicholas Jardine and Cornelis J. van Rijsbergen published "The use of hierarchic clustering in information retrieval", which articulated the "cluster hypothesis.'),
(17009, 79, '"[14] 1975: Three highly influential publications by Salton fully articulated his vector processing framework and term discrimination model: A Theory of Indexing (Society for Industrial and Applied Mathematics) A Theory of Term Importance in Automatic Text Analysis (JASIS v. 26) A Vector Space Model for Automatic Indexing (CACM 18:11) 1978: The First ACM SIGIR conference.'),
(17010, 79, '\n\n1979: C. J. van Rijsbergen published Information Retrieval (Butterworths).'),
(17011, 79, 'Heavy emphasis on probabilistic models.'),
(17012, 79, '\n\n1980s 1980: First international ACM SIGIR conference, joint with British Computer Society IR group in Cambridge.'),
(17013, 79, '\n\n1982: Nicholas J. Belkin, Robert N. Oddy, and Helen M. Brooks proposed the ASK (Anomalous State of Knowledge) viewpoint for information retrieval.'),
(17014, 79, 'This was an important concept, though their automated analysis tool proved ultimately disappointing.'),
(17015, 79, '\n\n1983: Salton (and Michael J. McGill) published Introduction to Modern Information Retrieval (McGraw-Hill), with heavy emphasis on vector models.'),
(17016, 79, '\n\n1985: David Blair and Bill Maron publish: An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System mid-1980s: Efforts to develop end-user versions of commercial IR systems.'),
(17017, 79, '\n\n19851993: Key papers on and experimental systems for visualization interfaces.'),
(17018, 79, '\n\nWork by Donald B. Crouch, Robert R. Korfhage, Matthew Chalmers, Anselm Spoerri and others.'),
(17019, 79, '\n\n1989: First World Wide Web proposals by Tim Berners-Lee at CERN.'),
(17020, 79, '\n\n1990s 1992: First TREC conference.'),
(17021, 79, '\n\n1997: Publication of Korfhage's Information Storage and Retrieval[15] with emphasis on visualization and multi-reference point systems.'),
(17022, 79, '\n\nlate 1990s: Web search engines implementation of many features formerly found only in experimental IR systems.'),
(17023, 79, 'Search engines become the most common and maybe best instantiation of IR models, research, and implement by Jocanz.'),
(17024, 79, '\n\nAwards in the field[edit] Tony Kent Strix award Gerard Salton Award See also[edit] Adversarial information retrieval Collaborative information seeking Controlled vocabulary Cross-language information retrieval Data mining European Summer School in Information Retrieval Humancomputer information retrieval Information extraction Information Retrieval Facility Knowledge visualization Multimedia Information Retrieval List of information retrieval libraries Personal information management Relevance (Information Retrieval) Relevance feedback Rocchio Classification Search index Social information seeking Special Interest Group on Information Retrieval Structured Search Subject indexing Temporal information retrieval Tf-idf XML-Retrieval Key-objects References[edit] Wikiquote has a collection of quotations related to: Information retrieval ACM SIGIR: Information Retrieval Special Interest Group BCS IRSG: British Computer Society - Information Retrieval Specialist Group Text Retrieval Conference (TREC) Forum for Information Retrieval Evaluation (FIRE) Information Retrieval (online book) by C. J. van Rijsbergen Information Retrieval Wiki Information Retrieval Facility Information Retrieval @ DUTH Introduction to Information Retrieval (online book) by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schtze, Cambridge University Press.'),
(17025, 79, '2008.'),
(17026, 79, '\n\nCategories: Information retrievalNatural language processing Navigation menu Create accountLog inArticleTalkReadEditView history Edit links This page was last modified on 17 February 2014 at 05:19.'),
(17027, 79, '\n\nText is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply.'),
(17028, 79, 'By using this site, you agree to the Terms of Use and Privacy Policy.'),
(17029, 79, '\n\nWikipedia is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.'),
(17030, 79, '\n\nPrivacy policyAbout WikipediaDisclaimersContact WikipediaDevelopersMobile viewWikimedia Foundation Powered by MediaWiki'),
(21846, 26, 'Cobra ( pronunciation (help·info)) is the Portuguese word for ``snake''''.'),
(21847, 26, 'In English and some other languages it has been adopted as the name for any of various species of venomous snakes.'),
(21848, 26, 'Most of those species are in the family Elapidae, all of which are venomous.'),
(21849, 26, 'Most of them can spread their neck ribs to form a flattened, widened hood.'),
(21850, 26, '\r\n\r\nNot all snakes commonly referred to as cobras are of the same genus, or even in the family Elapidae.'),
(21851, 26, 'The name ``cobra'''' is short for cobra de capelo or cobra-de-capelo, which is Portuguese for ``snake with hood'''', or ``hood-snake''''.'),
(21852, 26, '[1] In some modern languages, such as Afrikaans, the other part of the Portuguese name was adopted, and the predominant name for a cobra in Afrikaans is ``kapel''''.'),
(21853, 26, '[2][3] When disturbed, most of these snakes rear up and spread their necks (or hoods) in a characteristic threat display, making them a favorite of snake charmers because of the dramatic effect.'),
(21854, 26, 'Long ago, snake charming used to be a religious ritual, though nowadays it has become an entertainment.'),
(21855, 26, 'Cobras, which may live up to 20 years, are found from southern Africa, through southern Asia, to some of the islands of Southeast Asia.'),
(21856, 26, '\r\n\r\nCobra may refer to: Naja, also known as typical or ``true'''' cobras (known for raising the front part of the body and flattening the neck in a warning signal when alarmed), a group of elapids found in Africa and Asia.'),
(21857, 26, 'They include over 20 species, including Naja nivea, the cape, a moderately sized, highly venomous cobra inhabiting a wide variety of biomes across southern Africa, Cleopatra''s ``asp'''' (the Egyptian cobra, Naja haje) and the Asiatic spectacled cobra Naja naja and monocled cobra, Naja kaouthia.'),
(21858, 26, '\r\n\r\nSpitting cobras, a subset of Naja species with the ability to squirt venom from their fangs in self-defense Hemachatus haemachatus, ringhals, rinkhals or ring-necked spitting cobra, a species of the Elapidae found in Africa Any member of the genus Boulengerina, the water cobras, a group of Elapidae found in Africa (now regarded as species in the genus Naja) Paranaja multifasciata, the burrowing cobra, an African species of the Elapidae (now regarded as a species of Naja) Any member of the genus Aspidelaps, the shield cobras,[4] an African genus in the Elapidae Any species of Pseudohaje, the tree cobras, a genus of African Elapidae Ophiophagus hannah, the king cobra, an elapid found in parts of India and southern Asia Micrurus fulvius, the American cobra or eastern coral snake, a species of the Elapidae found in the southeastern United States Hydrodynastes gigas, the false water cobra, a mildly venomous member of the family Colubridae.'),
(21859, 26, 'It is indigenous to parts of South America and forms a hood if disturbed, though the hood is longer and narrower than those of ``true'''' cobras in the Elapidae.'),
(21860, 26, '\r\n\r\nNot a common name, but a highly obsolete synonym for the genus Bitis, the adders, a group of venomous vipers found in Africa and parts of the Middle East.'),
(21861, 26, '\r\n\r\nMost so-called, and all ``true'''', species of cobras belong to the family Elapidae.'),
(21862, 26, 'So do many other notoriously venomous snake species, including mambas, sea snakes, and coral snakes.'),
(21863, 26, 'The genus Naja contains over twenty species of cobras and is the most widespread and widely recognized genus of cobras, sometimes called the ``true'''' cobras.'),
(21864, 26, 'Members of the genus range from Africa through the Middle East, India, and Southeast Asia to Indonesia.'),
(21865, 26, '\r\n\r\nAlthough the king cobra, Ophiophagus hannah, the world''s longest venomous snake, is a member of the Elapidae and can raise a rather narrow hood if disturbed, it is not in the genus Naja and accordingly is not a true cobra.'),
(21866, 26, '\r\n\r\nThe other cobra of Asia is known as Asian, Indian or Spectacled cobra due to the eyeglass-shaped pattern on its skin.'),
(21867, 26, 'The hood of the Asian cobra is larger than that of the king cobra and is usually yellow or brown with a black and white spectacle pattern on top and two black and white spots on the lower surface.'),
(21868, 26, '\r\n\r\nThe Rinkhals, Hemachatus haemachatus also called a spitting cobra, is endemic to southern Africa.'),
(21869, 26, 'It also is not in the genus Naja Although the bites of some species are extremely dangerous, cobras of any kind have not been shown to attack people unprovoked, and practically never without a threat display, typically raising the hood and hissing.'),
(21870, 26, '\r\n\r\nVarious species of cobras prey mainly on other snakes, birds and small mammals, while its main natural predators in turn are other snakes, birds of prey, and small predatory mammals such as mongooses.'),
(21871, 26, '\r\n\r\nAlthough most cobras don''t make nests, some species protect their eggs until they hatch (incubation typically taking around 60 days'),
(21872, 80, 'Information retrieval is the activity of obtaining information resources relevant to an information need from a collection of information resources.'),
(21873, 80, 'Searches can be based on metadata or on full-text (or other content-based) indexing.'),
(21874, 80, '\n\nAutomated information retrieval systems are used to reduce what has been called "information overload".'),
(21875, 80, 'Many universities and public libraries use IR systems to provide access to books, journals and other documents.'),
(21876, 80, 'Web search engines are the most visible IR applications.'),
(21877, 80, '\n\nContents [hide] 1 Overview 2 History 3 Model types 3.1 First dimension: mathematical basis 3.2 Second dimension: properties of the model 4 Performance and correctness measures 4.1 Precision 4.2 Recall 4.3 Fall-out 4.4 F-measure 4.5 Average precision 4.6 R-Precision 4.7 Mean average precision 4.8 Discounted cumulative gain 4.9 Other Measures 4.10 Timeline 5 Awards in the field 6 See also 7 References 8 External links Overview[edit] An information retrieval process begins when a user enters a query into the system.'),
(21878, 80, 'Queries are formal statements of information needs, for example search strings in web search engines.'),
(21879, 80, 'In information retrieval a query does not uniquely identify a single object in the collection.'),
(21880, 80, 'Instead, several objects may match the query, perhaps with different degrees of relevancy.'),
(21881, 80, '\n\nAn object is an entity that is represented by information in a database.'),
(21882, 80, 'User queries are matched against the database information.'),
(21883, 80, 'Depending on the application the data objects may be, for example, text documents, images,[1] audio,[2] mind maps[3] or videos.'),
(21884, 80, 'Often the documents themselves are not kept or stored directly in the IR system, but are instead represented in the system by document surrogates or metadata.'),
(21885, 80, '\n\nMost IR systems compute a numeric score on how well each object in the database matches the query, and rank the objects according to this value.'),
(21886, 80, 'The top ranking objects are then shown to the user.'),
(21887, 80, 'The process may then be iterated if the user wishes to refine the query.'),
(21888, 80, '[4] History[edit] But do you know that, although I have kept the diary [on a phonograph] for months past, it never once struck me how I was going to find any particular part of it in case I wanted to look it up?'),
(21889, 80, '\n\nDr Seward, Bram Stoker's Dracula, 1897 The idea of using computers to search for relevant pieces of information was popularized in the article As We May Think by Vannevar Bush in 1945.'),
(21890, 80, '[5] The first automated information retrieval systems were introduced in the 1950s and 1960s.'),
(21891, 80, 'By 1970 several different techniques had been shown to perform well on small text corpora such as the Cranfield collection (several thousand documents).'),
(21892, 80, '[5] Large-scale retrieval systems, such as the Lockheed Dialog system, came into use early in the 1970s.'),
(21893, 80, '\n\nIn 1992, the US Department of Defense along with the National Institute of Standards and Technology (NIST), cosponsored the Text Retrieval Conference (TREC) as part of the TIPSTER text program.'),
(21894, 80, 'The aim of this was to look into the information retrieval community by supplying the infrastructure that was needed for evaluation of text retrieval methodologies on a very large text collection.'),
(21895, 80, 'This catalyzed research on methods that scale to huge corpora.'),
(21896, 80, 'The introduction of web search engines has boosted the need for very large scale retrieval systems even further.'),
(21897, 80, '\n\nModel types[edit] Categorization of IR-models (translated from German entry, original source Dominik Kuropka).'),
(21898, 80, '\n\nFor effectively retrieving relevant documents by IR strategies, the documents are typically transformed into a suitable representation.'),
(21899, 80, 'Each retrieval strategy incorporate a specific model for its document representation purposes.'),
(21900, 80, 'The picture on the right illustrates the relationship of some common models.'),
(21901, 80, 'In the picture, the models are categorized according to two dimensions: the mathematical basis and the properties of the model.'),
(21902, 80, '\n\nFirst dimension: mathematical basis[edit] Set-theoretic models represent documents as sets of words or phrases.'),
(21903, 80, 'Similarities are usually derived from set-theoretic operations on those sets.'),
(21904, 80, 'Common models are: Standard Boolean model Extended Boolean model Fuzzy retrieval Algebraic models represent documents and queries usually as vectors, matrices, or tuples.'),
(21905, 80, 'The similarity of the query vector and document vector is represented as a scalar value.'),
(21906, 80, '\n\nVector space model Generalized vector space model (Enhanced) Topic-based Vector Space Model Extended Boolean model Latent semantic indexing aka latent semantic analysis Probabilistic models treat the process of document retrieval as a probabilistic inference.'),
(21907, 80, 'Similarities are computed as probabilities that a document is relevant for a given query.'),
(21908, 80, 'Probabilistic theorems like the Bayes' theorem are often used in these models.'),
(21909, 80, '\n\nBinary Independence Model Probabilistic relevance model on which is based the okapi (BM25) relevance function Uncertain inference Language models Divergence-from-randomness model Latent Dirichlet allocation Feature-based retrieval models view documents as vectors of values of feature functions (or just features) and seek the best way to combine these features into a single relevance score, typically by learning to rank methods.'),
(21910, 80, 'Feature functions are arbitrary functions of document and query, and as such can easily incorporate almost any other retrieval model as just a yet another feature.'),
(21911, 80, '\n\nSecond dimension: properties of the model[edit] Models without term-interdependencies treat different terms/words as independent.'),
(21912, 80, 'This fact is usually represented in vector space models by the orthogonality assumption of term vectors or in probabilistic models by an independency assumption for term variables.'),
(21913, 80, '\n\nModels with immanent term interdependencies allow a representation of interdependencies between terms.'),
(21914, 80, 'However the degree of the interdependency between two terms is defined by the model itself.'),
(21915, 80, 'It is usually directly or indirectly derived (e.g. by dimensional reduction) from the co-occurrence of those terms in the whole set of documents.'),
(21916, 80, '\n\nModels with transcendent term interdependencies allow a representation of interdependencies between terms, but they do not allege how the interdependency between two terms is defined.'),
(21917, 80, 'They relay an external source for the degree of interdependency between two terms.'),
(21918, 80, '(For example a human or sophisticated algorithms.)'),
(21919, 80, '\n\nPerformance and correctness measures[edit] Main article: Precision and recall Many different measures for evaluating the performance of information retrieval systems have been proposed.'),
(21920, 80, 'The measures require a collection of documents and a query.'),
(21921, 80, 'All common measures described here assume a ground truth notion of relevancy: every document is known to be either relevant or non-relevant to a particular query.'),
(21922, 80, 'In practice queries may be ill-posed and there may be different shades of relevancy.'),
(21923, 80, '\n\nPrecision[edit] Precision is the fraction of the documents retrieved that are relevant to the user's information need.'),
(21924, 80, '\n\n \\mbox{precision}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{retrieved documents}\\}|} In binary classification, precision is analogous to positive predictive value.'),
(21925, 80, 'Precision takes all retrieved documents into account.'),
(21926, 80, 'It can also be evaluated at a given cut-off rank, considering only the topmost results returned by the system.'),
(21927, 80, 'This measure is called precision at n or P@n.'),
(21928, 80, '\n\nNote that the meaning and usage of "precision" in the field of Information Retrieval differs from the definition of accuracy and precision within other branches of science and statistics.'),
(21929, 80, '\n\nRecall[edit] Recall is the fraction of the documents that are relevant to the query that are successfully retrieved.'),
(21930, 80, '\n\n\\mbox{recall}=\\frac{|\\{\\mbox{relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{relevant documents}\\}|} In binary classification, recall is often called sensitivity.'),
(21931, 80, 'So it can be looked at as the probability that a relevant document is retrieved by the query.'),
(21932, 80, '\n\nIt is trivial to achieve recall of 100% by returning all documents in response to any query.'),
(21933, 80, 'Therefore recall alone is not enough but one needs to measure the number of non-relevant documents also, for example by computing the precision.'),
(21934, 80, '\n\nFall-out[edit] The proportion of non-relevant documents that are retrieved, out of all non-relevant documents available: \\mbox{fall-out}=\\frac{|\\{\\mbox{non-relevant documents}\\}\\cap\\{\\mbox{retrieved documents}\\}|}{|\\{\\mbox{non-relevant documents}\\}|} In binary classification, fall-out is closely related to specificity and is equal to (1-\\mbox{specificity}).'),
(21935, 80, 'It can be looked at as the probability that a non-relevant document is retrieved by the query.'),
(21936, 80, '\n\nIt is trivial to achieve fall-out of 0% by returning zero documents in response to any query.'),
(21937, 80, '\n\nF-measure[edit] Main article: F-score The weighted harmonic mean of precision and recall, the traditional F-measure or balanced F-score is: F = \\frac{2 \\cdot \\mathrm{precision} \\cdot \\mathrm{recall}}{(\\mathrm{precision} + \\mathrm{recall})}.');
INSERT INTO `sentences` (`id`, `document_id`, `sentence`) VALUES
(21938, 80, '\\, This is also known as the F_1 measure, because recall and precision are evenly weighted.'),
(21939, 80, '\n\nThe general formula for non-negative real \\beta is: F_\\beta = \\frac{(1 + \\beta^2) \\cdot (\\mathrm{precision} \\cdot \\mathrm{recall})}{(\\beta^2 \\cdot \\mathrm{precision} + \\mathrm{recall})}\\,.'),
(21940, 80, '\n\nTwo other commonly used F measures are the F_{2} measure, which weights recall twice as much as precision, and the F_{0.5} measure, which weights precision twice as much as recall.'),
(21941, 80, '\n\nThe F-measure was derived by van Rijsbergen (1979) so that F_\\beta "measures the effectiveness of retrieval with respect to a user who attaches \\beta times as much importance to recall as precision".'),
(21942, 80, 'It is based on van Rijsbergen's effectiveness measure E = 1 - \\frac{1}{\\frac{\\alpha}{P} + \\frac{1-\\alpha}{R}}.'),
(21943, 80, 'Their relationship is F_\\beta = 1 - E where \\alpha=\\frac{1}{1 + \\beta^2}.'),
(21944, 80, '\n\nAverage precision[edit] Precision and recall are single-value metrics based on the whole list of documents returned by the system.'),
(21945, 80, 'For systems that return a ranked sequence of documents, it is desirable to also consider the order in which the returned documents are presented.'),
(21946, 80, 'By computing a precision and recall at every position in the ranked sequence of documents, one can plot a precision-recall curve, plotting precision p(r) as a function of recall r. Average precision computes the average value of p(r) over the interval from r=0 to r=1:[6] \\operatorname{AveP} = \\int_0^1 p(r)dr That is the area under the precision-recall curve.'),
(21947, 80, 'This integral is in practice replaced with a finite sum over every position in the ranked sequence of documents: \\operatorname{AveP} = \\sum_{k=1}^n P(k) \\Delta r(k) where k is the rank in the sequence of retrieved documents, n is the number of retrieved documents, P(k) is the precision at cut-off k in the list, and \\Delta r(k) is the change in recall from items k-1 to k.[6] This finite sum is equivalent to: \\operatorname{AveP} = \\frac{\\sum_{k=1}^n (P(k) \\times \\operatorname{rel}(k))}{\\mbox{number of relevant documents}} \\!'),
(21948, 80, '\n\nwhere \\operatorname{rel}(k) is an indicator function equaling 1 if the item at rank k is a relevant document, zero otherwise.'),
(21949, 80, '[7] Note that the average is over all relevant documents and the relevant documents not retrieved get a precision score of zero.'),
(21950, 80, '\n\nSome authors choose to interpolate the p(r) function to reduce the impact of "wiggles" in the curve.'),
(21951, 80, '[8][9] For example, the PASCAL Visual Object Classes challenge (a benchmark for computer vision object detection) computes average precision by averaging the precision over a set of evenly spaced recall levels {0, 0.1, 0.2, ... 1.0}:[8][9] \\operatorname{AveP} = \\frac{1}{11} \\sum_{r \\in \\{0, 0.1, \\ldots, 1.0\\}} p_{\\operatorname{interp}}(r) where p_{\\operatorname{interp}}(r) is an interpolated precision that takes the maximum precision over all recalls greater than r: p_{\\operatorname{interp}}(r) = \\operatorname{max}_{\\tilde{r}:\\tilde{r} \\geq r} p(\\tilde{r}).'),
(21952, 80, '\n\nAn alternative is to derive an analytical p(r) function by assuming a particular parametric distribution for the underlying decision values.'),
(21953, 80, 'For example, a binormal precision-recall curve can be obtained by assuming decision values in both classes to follow a Gaussian distribution.'),
(21954, 80, '[10] R-Precision[edit] Precision at R-th position in the ranking of results for a query that has R relevant documents.'),
(21955, 80, 'This measure is highly correlated to Average Precision.'),
(21956, 80, 'Also, Precision is equal to Recall at the R-th position.'),
(21957, 80, '\n\nMean average precision[edit] Mean average precision for a set of queries is the mean of the average precision scores for each query.'),
(21958, 80, '\n\n \\operatorname{MAP} = \\frac{\\sum_{q=1}^Q \\operatorname{AveP(q)}}{Q} \\!'),
(21959, 80, '\n\nwhere Q is the number of queries.'),
(21960, 80, '\n\nDiscounted cumulative gain[edit] Main article: Discounted cumulative gain DCG uses a graded relevance scale of documents from the result set to evaluate the usefulness, or gain, of a document based on its position in the result list.'),
(21961, 80, 'The premise of DCG is that highly relevant documents appearing lower in a search result list should be penalized as the graded relevance value is reduced logarithmically proportional to the position of the result.'),
(21962, 80, '\n\nThe DCG accumulated at a particular rank position p is defined as: \\mathrm{DCG_{p}} = rel_{1} + \\sum_{i=2}^{p} \\frac{rel_{i}}{\\log_{2}i}.'),
(21963, 80, '\n\nSince result set may vary in size among different queries or systems, to compare performances the normalised version of DCG uses an ideal DCG.'),
(21964, 80, 'To this end, it sorts documents of a result list by relevance, producing an ideal DCG at position p (IDCG_p), which normalizes the score: \\mathrm{nDCG_{p}} = \\frac{DCG_{p}}{IDCG{p}}.'),
(21965, 80, '\n\nThe nDCG values for all queries can be averaged to obtain a measure of the average performance of a ranking algorithm.'),
(21966, 80, 'Note that in a perfect ranking algorithm, the DCG_p will be the same as the IDCG_p producing an nDCG of 1.0.'),
(21967, 80, 'All nDCG calculations are then relative values on the interval 0.0 to 1.0 and so are cross-query comparable.'),
(21968, 80, '\n\nOther Measures[edit] Mean reciprocal rank Spearman's rank correlation coefficient Timeline[edit] Before the 1900s 1801: Joseph Marie Jacquard invents the Jacquard loom, the first machine to use punched cards to control a sequence of operations.'),
(21969, 80, '\n\n1880s: Herman Hollerith invents an electro-mechanical data tabulator using punch cards as a machine readable medium.'),
(21970, 80, '\n\n1890 Hollerith cards, keypunches and tabulators used to process the 1890 US Census data.'),
(21971, 80, '\n\n1920s-1930s Emanuel Goldberg submits patents for his "Statistical Machine a document search engine that used photoelectric cells and pattern recognition to search the metadata on rolls of microfilmed documents.'),
(21972, 80, '\n\n1940s1950s late 1940s: The US military confronted problems of indexing and retrieval of wartime scientific research documents captured from Germans.'),
(21973, 80, '\n\n1945: Vannevar Bush's As We May Think appeared in Atlantic Monthly.'),
(21974, 80, '\n\n1947: Hans Peter Luhn (research engineer at IBM since 1941) began work on a mechanized punch card-based system for searching chemical compounds.'),
(21975, 80, '\n\n1950s: Growing concern in the US for a "science gap" with the USSR motivated, encouraged funding and provided a backdrop for mechanized literature searching systems (Allen Kent et al.) and the invention of citation indexing (Eugene Garfield).'),
(21976, 80, '\n\n1950: The term "information retrieval" appears to have been coined by Calvin Mooers.'),
(21977, 80, '[11] 1951: Philip Bagley conducted the earliest experiment in computerized document retrieval in a master thesis at MIT.'),
(21978, 80, '[12] 1955: Allen Kent joined Case Western Reserve University, and eventually became associate director of the Center for Documentation and Communications Research.'),
(21979, 80, 'That same year, Kent and colleagues published a paper in American Documentation describing the precision and recall measures as well as detailing a proposed "framework" for evaluating an IR system which included statistical sampling methods for determining the number of relevant documents not retrieved.'),
(21980, 80, '\n\n1958: International Conference on Scientific Information Washington DC included consideration of IR systems as a solution to problems identified.'),
(21981, 80, 'See: Proceedings of the International Conference on Scientific Information, 1958 (National Academy of Sciences, Washington, DC, 1959) 1959: Hans Peter Luhn published "Auto-encoding of documents for information retrieval.'),
(21982, 80, '" 1960s: early 1960s: Gerard Salton began work on IR at Harvard, later moved to Cornell.'),
(21983, 80, '\n\n1960: Melvin Earl Maron and John Lary Kuhns[13] published "On relevance, probabilistic indexing, and information retrieval" in the Journal of the ACM 7(3):216244, July 1960.'),
(21984, 80, '\n\n1962: Cyril W. Cleverdon published early findings of the Cranfield studies, developing a model for IR system evaluation.'),
(21985, 80, 'See: Cyril W. Cleverdon, "Report on the Testing and Analysis of an Investigation into the Comparative Efficiency of Indexing Systems".'),
(21986, 80, 'Cranfield Collection of Aeronautics, Cranfield, England, 1962.'),
(21987, 80, '\n\nKent published Information Analysis and Retrieval.'),
(21988, 80, '\n\n1963: Weinberg report "Science, Government and Information" gave a full articulation of the idea of a "crisis of scientific information.'),
(21989, 80, '" The report was named after Dr. Alvin Weinberg.'),
(21990, 80, '\n\nJoseph Becker and Robert M. Hayes published text on information retrieval.'),
(21991, 80, 'Becker, Joseph; Hayes, Robert Mayo.'),
(21992, 80, 'Information storage and retrieval: tools, elements, theories.'),
(21993, 80, 'New York, Wiley (1963).'),
(21994, 80, '\n\n1964: Karen Sprck Jones finished her thesis at Cambridge, Synonymy and Semantic Classification, and continued work on computational linguistics as it applies to IR.'),
(21995, 80, '\n\nThe National Bureau of Standards sponsored a symposium titled "Statistical Association Methods for Mechanized Documentation.'),
(21996, 80, '" Several highly significant papers, including G. Salton's first published reference (we believe) to the SMART system.'),
(21997, 80, '\n\nmid-1960s: National Library of Medicine developed MEDLARS Medical Literature Analysis and Retrieval System, the first major machine-readable database and batch-retrieval system.'),
(21998, 80, '\n\nProject Intrex at MIT.'),
(21999, 80, '\n\n1965: J. C. R. Licklider published Libraries of the Future.'),
(22000, 80, '\n\n1966: Don Swanson was involved in studies at University of Chicago on Requirements for Future Catalogs.'),
(22001, 80, '\n\nlate 1960s: F. Wilfrid Lancaster completed evaluation studies of the MEDLARS system and published the first edition of his text on information retrieval.'),
(22002, 80, '\n\n1968: Gerard Salton published Automatic Information Organization and Retrieval.'),
(22003, 80, '\n\nJohn W. Sammon, Jr.'s RADC Tech report "Some Mathematics of Information Storage and Retrieval..." outlined the vector model.'),
(22004, 80, '\n\n1969: Sammon's "A nonlinear mapping for data structure analysis" (IEEE Transactions on Computers) was the first proposal for visualization interface to an IR system.'),
(22005, 80, '\n\n1970s early 1970s: First online systemsNLM's AIM-TWX, MEDLINE; Lockheed's Dialog; SDC's ORBIT.'),
(22006, 80, '\n\nTheodor Nelson promoting concept of hypertext, published Computer Lib/Dream Machines.'),
(22007, 80, '\n\n1971: Nicholas Jardine and Cornelis J. van Rijsbergen published "The use of hierarchic clustering in information retrieval", which articulated the "cluster hypothesis.'),
(22008, 80, '"[14] 1975: Three highly influential publications by Salton fully articulated his vector processing framework and term discrimination model: A Theory of Indexing (Society for Industrial and Applied Mathematics) A Theory of Term Importance in Automatic Text Analysis (JASIS v. 26) A Vector Space Model for Automatic Indexing (CACM 18:11) 1978: The First ACM SIGIR conference.'),
(22009, 80, '\n\n1979: C. J. van Rijsbergen published Information Retrieval (Butterworths).'),
(22010, 80, 'Heavy emphasis on probabilistic models.'),
(22011, 80, '\n\n1980s 1980: First international ACM SIGIR conference, joint with British Computer Society IR group in Cambridge.'),
(22012, 80, '\n\n1982: Nicholas J. Belkin, Robert N. Oddy, and Helen M. Brooks proposed the ASK (Anomalous State of Knowledge) viewpoint for information retrieval.'),
(22013, 80, 'This was an important concept, though their automated analysis tool proved ultimately disappointing.'),
(22014, 80, '\n\n1983: Salton (and Michael J. McGill) published Introduction to Modern Information Retrieval (McGraw-Hill), with heavy emphasis on vector models.'),
(22015, 80, '\n\n1985: David Blair and Bill Maron publish: An Evaluation of Retrieval Effectiveness for a Full-Text Document-Retrieval System mid-1980s: Efforts to develop end-user versions of commercial IR systems.'),
(22016, 80, '\n\n19851993: Key papers on and experimental systems for visualization interfaces.'),
(22017, 80, '\n\nWork by Donald B. Crouch, Robert R. Korfhage, Matthew Chalmers, Anselm Spoerri and others.'),
(22018, 80, '\n\n1989: First World Wide Web proposals by Tim Berners-Lee at CERN.'),
(22019, 80, '\n\n1990s 1992: First TREC conference.'),
(22020, 80, '\n\n1997: Publication of Korfhage's Information Storage and Retrieval[15] with emphasis on visualization and multi-reference point systems.'),
(22021, 80, '\n\nlate 1990s: Web search engines implementation of many features formerly found only in experimental IR systems.'),
(22022, 80, 'Search engines become the most common and maybe best instantiation of IR models, research, and implement by Jocanz.'),
(22023, 80, '\n\nAwards in the field[edit] Tony Kent Strix award Gerard Salton Award See also[edit] Adversarial information retrieval Collaborative information seeking Controlled vocabulary Cross-language information retrieval Data mining European Summer School in Information Retrieval Humancomputer information retrieval Information extraction Information Retrieval Facility Knowledge visualization Multimedia Information Retrieval List of information retrieval libraries Personal information management Relevance (Information Retrieval) Relevance feedback Rocchio Classification Search index Social information seeking Special Interest Group on Information Retrieval Structured Search Subject indexing Temporal information retrieval Tf-idf XML-Retrieval Key-objects References[edit] Wikiquote has a collection of quotations related to: Information retrieval ACM SIGIR: Information Retrieval Special Interest Group BCS IRSG: British Computer Society - Information Retrieval Specialist Group Text Retrieval Conference (TREC) Forum for Information Retrieval Evaluation (FIRE) Information Retrieval (online book) by C. J. van Rijsbergen Information Retrieval Wiki Information Retrieval Facility Information Retrieval @ DUTH Introduction to Information Retrieval (online book) by Christopher D. Manning, Prabhakar Raghavan and Hinrich Schtze, Cambridge University Press.'),
(22024, 80, '2008.'),
(22025, 80, '\n\nCategories: Information retrievalNatural language processing Navigation menu Create accountLog inArticleTalkReadEditView history Edit links This page was last modified on 17 February 2014 at 05:19.'),
(22026, 80, '\n\nText is available under the Creative Commons Attribution-ShareAlike License; additional terms may apply.'),
(22027, 80, 'By using this site, you agree to the Terms of Use and Privacy Policy.'),
(22028, 80, '\n\nWikipedia is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.'),
(22029, 80, '\n\nPrivacy policyAbout WikipediaDisclaimersContact WikipediaDevelopersMobile viewWikimedia Foundation Powered by MediaWiki'),
(22030, 81, 'Climate change is a significant and lasting change in the statistical distribution of weather patterns over periods ranging from decades to millions of years.'),
(22031, 81, 'It may be a change in average weather conditions, or in the distribution of weather around the average conditions (i.e., more or fewer extreme weather events).'),
(22032, 81, 'Climate change is caused by factors such as biotic processes, variations in solar radiation received by Earth, plate tectonics, and volcanic eruptions.'),
(22033, 81, 'Certain human activities have also been identified as significant causes of recent climate change, often referred to as "global warming".'),
(22034, 81, '\n\n Scientists actively work to understand past and future climate by using observations and theoretical models.'),
(22035, 81, 'A climate record extending deep into the Earth's past has been assembled, and continues to be built up, based on geological evidence from borehole temperature profiles, cores removed from deep accumulations of ice, floral and faunal records, glacial and periglacial processes, stable-isotope and other analyses of sediment layers, and records of past sea levels.'),
(22036, 81, 'More recent data are provided by the instrumental record.'),
(22037, 81, 'General circulation models, based on the physical sciences, are often used in theoretical approaches to match past climate data, make future projections, and link causes and effects in climate change.'),
(22038, 81, '\n\n The most general definition of climate change is a change in the statistical properties of the climate system when considered over long periods of time, regardless of cause.'),
(22039, 81, 'Accordingly, fluctuations over periods shorter than a few decades, such as El Nio, do not represent climate change.'),
(22040, 81, '\n\nThe term sometimes is used to refer specifically to climate change caused by human activity, as opposed to changes in climate that may have resulted as part of Earth's natural processes.'),
(22041, 81, 'In this sense, especially in the context of environmental policy, the term climate change has become synonymous with anthropogenic global warming.'),
(22042, 81, 'Within scientific journals, global warming refers to surface temperature increases while climate change includes global warming and everything else that increasing greenhouse gas levels will affect.'),
(22043, 81, '\n\n On the broadest scale, the rate at which energy is received from the sun and the rate at which it is lost to space determine the equilibrium temperature and climate of Earth.'),
(22044, 81, 'This energy is distributed around the globe by winds, ocean currents, and other mechanisms to affect the climates of different regions.'),
(22045, 81, '\n\n Factors that can shape climate are called climate forcings or "forcing mechanisms".'),
(22046, 81, 'These include processes such as variations in solar radiation, variations in the Earth's orbit, mountain-building and continental drift and changes in greenhouse gas concentrations.'),
(22047, 81, 'There are a variety of climate change feedbacks that can either amplify or diminish the initial forcing.'),
(22048, 81, 'Some parts of the climate system, such as the oceans and ice caps, respond slowly in reaction to climate forcings, while others respond more quickly.'),
(22049, 81, '\n\n Forcing mechanisms can be either "internal" or "external".'),
(22050, 81, 'Internal forcing mechanisms are natural processes within the climate system itself (e.g., the thermohaline circulation).'),
(22051, 81, 'External forcing mechanisms can be either natural (e.g., changes in solar output) or anthropogenic (e.g., increased emissions of greenhouse gases).'),
(22052, 81, '\n\n Whether the initial forcing mechanism is internal or external, the response of the climate system might be fast (e.g., a sudden cooling due to airborne volcanic ash reflecting sunlight), slow (e.g. thermal expansion of warming ocean water), or a combination (e.g., sudden loss of albedo in the arctic ocean as sea ice melts, followed by more gradual thermal expansion of the water).'),
(22053, 81, 'Therefore, the climate system can respond abruptly, but the full response to forcing mechanisms might not be fully developed for centuries or even longer.'),
(22054, 81, '\n\n'),
(22055, 82, 'If you think of climate change as a hazard for some far-off polar bears years from now, you're mistaken.'),
(22056, 82, 'That's the message from top climate scientists gathering in Japan this week to assess the impact of global warming.'),
(22057, 82, 'In fact, they will say, the dangers of a warming Earth are immediate and very human.'),
(22058, 82, '"The polar bear is us," says Patricia Romero Lankao of the federally financed National Center for Atmospheric Research in Boulder, Colorado, referring to the first species to be listed as threatened by global warming due to melting sea ice.'),
(22059, 82, 'She will be among the more than 60 scientists in Japan to finish writing a massive and authoritative report on the impacts of global warming.'),
(22060, 82, 'With representatives from about 100 governments at this week's meeting of the Intergovernmental Panel on Climate Change, they'll wrap up a summary that tells world leaders how bad the problem is.'),
(22061, 82, '\n\n The key message from leaked drafts and interviews with the authors and other scientists: The big risks and overall effects of global warming are far more immediate and local than scientists once thought.'),
(22062, 82, 'It's not just about melting ice, threatened animals and plants.'),
(22063, 82, 'It's about the human problems of hunger, disease, drought, flooding, refugees and war, becoming worse.'),
(22064, 82, '\n\n The report says scientists have already observed many changes from warming, such as an increase in heat waves in North America, Europe, Africa and Asia.'),
(22065, 82, 'Severe floods, such as the one that displaced 90,000 people in Mozambique in 2008, are now more common in Africa and Australia.'),
(22066, 82, 'Europe and North America are getting more intense downpours that can be damaging.'),
(22067, 82, 'Melting ice in the Arctic is not only affecting the polar bear, but already changing the culture and livelihoods of indigenous people in northern Canada.'),
(22068, 82, '\n\n Past panel reports have been ignored because global warming's effects seemed too distant in time and location, says Pennsylvania State University scientist Michael Mann.'),
(22069, 82, 'This report finds "It's not far-off in the future and it's not exotic creatures it's us and now," says Mann, who didn't work on this latest report.'),
(22070, 82, '\n\n The United Nations established the climate change panel in 1988 and its work is done by three groups.'),
(22071, 82, 'One looks at the science behind global warming.'),
(22072, 82, 'The group meeting in Japan beginning Tuesday studies its impacts.'),
(22073, 82, 'And a third looks at ways to slow warming.'),
(22074, 82, 'Its reports have reiterated what nearly every major scientific organization has said: The burning of coal, oil and gas is producing an increasing amount of heat-trapping greenhouse gases, such as carbon dioxide.'),
(22075, 82, 'Those gases change Earth's climate, bringing warmer temperatures and more extreme weather, and the problem is worsening.'),
(22076, 82, '\n\n The panel won the Nobel Peace Prize in 2007, months after it issued its last report.'),
(22077, 82, 'Since then, the impact group has been reviewing the latest research and writing 30 chapters on warming's effects and regional impacts.'),
(22078, 82, 'Those chapters haven't been officially released but were posted on a skeptical website.'),
(22079, 82, 'The key message can be summed up in one word that the overall report uses more than 5,000 times: risk.'),
(22080, 82, '"Climate change really is a challenge in managing risks," says the report's chief author, Chris Field of the Carnegie Institution of Science in California.'),
(22081, 82, '"It's very clear that we are not prepared for the kind of events we're seeing.'),
(22082, 82, '" Already the effects of global warming are "widespread and consequential," says one part of the larger report, noting that science has compiled more evidence and done much more research since the last report in 2007.'),
(22083, 82, '\n\n If climate change continues, the panel's larger report predicts these harms: VIOLENCE: For the first time, the panel is emphasizing the nuanced link between conflict and warming temperatures.'),
(22084, 82, 'Participating scientists say warming won't cause wars, but it will add a destabilizing factor that will make existing threats worse.'),
(22085, 82, '\n\n FOOD: Global food prices will rise between 3 and 84 percent by 2050 because of warmer temperatures and changes in rain patterns.'),
(22086, 82, 'Hotspots of hunger may emerge in cities.'),
(22087, 82, '\n\n WATER: About one-third of the world's population will see groundwater supplies drop by more than 10 percent by 2080, when compared with 1980 levels.'),
(22088, 82, 'For every degree of warming, more of the world will have significantly less water available.'),
(22089, 82, '\n\n HEALTH: Major increases in health problems are likely, with more illnesses and injury from heat waves and fires and more food and water-borne diseases.'),
(22090, 82, 'But the report also notes that warming's effects on health is relatively small compared with other problems, like poverty.'),
(22091, 82, '\n\n WEALTH: Many of the poor will get poorer.'),
(22092, 82, 'Economic growth and poverty reduction will slow down.'),
(22093, 82, 'If temperatures rise high enough, the world's overall income may start to go down, by as much as 2 percent, but that's difficult to forecast.'),
(22094, 82, '\n\n According to the report, risks from warming-related extreme weather, now at a moderate level, are likely to get worse with just a bit more warming.'),
(22095, 82, 'While it doesn't say climate change caused the events, the report cites droughts in northern Mexico and the south-central United States, and hurricanes such as 2012's Sandy, as illustrations of how vulnerable people are to weather extremes.'),
(22096, 82, 'It does say the deadly European heat wave in 2003 was made more likely because of global warming.'),
(22097, 82, '\n\n Texas Tech University climate scientist Katharine Hayhoe, who was not part of this report team, says the important nuance is how climate change interacts with other human problems: "It's interacting and exacerbating problems we already have today.'),
(22098, 82, '"University of Colorado science policy professor Roger Pielke Jr., a past critic of the panel's impact reports, said after reading the draft summary, "it's a lot of important work ... They made vast improvements to the quality of their assessments.'),
(22099, 82, '" Another critic, University of Alabama Huntsville professor John Christy, accepts man-made global warming but thinks its risks are overblown when compared with something like poverty.'),
(22100, 82, 'Climate change is not among the developing world's main problems, he says.'),
(22101, 82, '\n\n But other scientists say Christy is misguided.'),
(22102, 82, 'Earlier this month, the world's largest scientific organization, the American Association for the Advancement of Science, published a new fact sheet on global warming.'),
(22103, 82, 'It said: "Climate change is already happening.'),
(22104, 82, 'More heat waves, greater sea level rise and other changes with consequences for human health, natural ecosystems and agriculture are already occurring in the United States and worldwide.'),
(22105, 82, 'These problems are very likely to become worse over the next 10 to 20 years and beyond.'),
(22106, 82, '" Texas Tech's Hayhoe says scientists in the past may have created the impression that the main reason to care about climate change was its impact on the environment.'),
(22107, 82, '"We care about it because it's going to affect nearly every aspect of human life on this planet," she says.'),
(22108, 83, 'A very smart friend of ours tells us that he likes our posts about the problems that are impeding the kind of progress toward explaining and predicting disease that are major goals of genetics and epidemiology.'),
(22109, 83, 'What causes asthma, or heart disease, or obesity, or hypertension or breast cancer?'),
(22110, 83, 'And, how can we know who is going to get these diseases?'),