% exam0506_3110_6150.tex

\documentclass[12pt]{article}
\usepackage{pslatex}
\usepackage{amsfonts}
\usepackage{newexam+shield}
\usepackage{epsf}

\usepackage{myfancyverb}
\usepackage{examAnswers}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% TOGGLES

%\showAnswers
\hideAnswers

%\mscPaper
\ugPaper

%\input{toggle}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\pressmark{COM3110}
\mscOnly{\pressmark{COM6150}}

\pressmark{COM3110/COM6150}


\department{\bf {DEPARTMENT OF COMPUTER SCIENCE}}
\examtitle{{\bf TEXT PROCESSING}}
\examdate{{\bf Autumn Semester 2005-06}}
\examtime{{\bf 2 hours}}

\setter{Mark Hepple \\ Mark Stevenson}

\rubric{{\bf Answer THREE questions.

All questions carry equal weight.  Figures in square brackets indicate the
percentage of available marks allocated to each part of a question.}}

\mscOnly{\rubric{{\bf Answer the Question in Section A and TWO further questions 
from Section B. 

All questions carry equal weight.  Figures in square brackets indicate the
percentage of available marks allocated to each part of a question.}}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}

\begin{exam}

\turnover

\mscOnly{\begin{center}\bf
SECTION A
\end{center}}

\begin{question}

\begin{qupart}
Explain what is meant by the ``bag of words'' model which is used for
  various text processing tasks.
\mypercent{10}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

The bag of words model assumes that text can be represented as a
simple list of the words it contains and, possibly, their frequencies
in the text. Information about the relations between these words and
their position in the text is ignored.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Explain how the bag of words model can be used in Information
Retrieval and word sense disambiguation.
\mypercent{15}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

The standard approach to IR models each document as a bag of words,
and identifies relevant documents by comparing queries to documents
using similarity measures that work over such bag-of-words
representations. 

There are various applications in word sense disambiguation including naive
Bayesian classifiers which treat the context of an ambiguous word as a bag
of words and the Lesk algorithm for disambiguation using dictionary
definition overlap treats each definition as a bag of words.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Write down Bayes Rule and explain how it is used in the naive
  Bayesian classifier for word sense disambiguation. 
\mypercent{20}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Bayes rule is: 
$P(A|B) = \frac{P(B|A)P(A)}{P(B)}$

The naive Bayesian classifier aims to estimate the probability of a
sense given the context, i.e.\ $P(s|c)$, where $s$ is a particular sense
and $c$ is the context used by the algorithm. However, this cannot be
estimated directly so Bayes rule is applied and the formula re-written
as $\frac{P(c|s)P(s)}{P(c)}$. $P(c)$ is constant for all senses and
can be ignored.  We approximate $P(c|s)$ as
$P(a_1|s)\times\ldots\times{}P(a_n|s)$, where $a_1\ldots a_n$ are the
multiple features that make up
`context', i.e.\ making the naive Bayesian assumption that these
features are conditionally independent. The probabilities $P(s)$ and
$P(a_i|s)$ can be estimated from training text.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\answersonly{\newpage}

\begin{qupart}
What are the similarities between part of speech tagging and word
  sense disambiguation? Why are techniques for part of speech tagging
  not suitable for word sense disambiguation?  \mypercent{15}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Both part of speech tagging and WSD aim to add extra information to
each token in a text (or possibly just each content word). Part of
speech tagging adds syntactic information while WSD selects amongst
word meanings.

Part of speech tagging approaches generally examine a narrow context
around the word being annotated, for example the previous one or two
words. WSD requires a wider context.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Write a short Perl program that will read from a file on {\tt STDIN}
and that does nothing with the line that is read {\it until\/} it
encounters a line containing the string {\tt<BEGIN>}. Thereafter, each
subsequent line will be printed (to {\tt STDOUT}) {\it until\/} it
encounters a line containing the string {\tt <END>}. This line and
any that follow are not printed. You may assume that the {\tt<BEGIN>}
and {\tt<END>} strings will occur only once in a file.  \mypercent{20}
\end{qupart}

\begin{SaveVerbatim}{vrb1}
# Track whether we are inside the <BEGIN>...<END> region.
my $inside = 0;
while (my $line = <>) {
    if ($line =~ /<BEGIN>/) {
        $inside = 1;      # start echoing from the NEXT line
        next;
    }
    $inside = 0 if $line =~ /<END>/;   # the <END> line itself is not printed
    print $line if $inside;
}
\end{SaveVerbatim}

\begin{SaveVerbatim}{vrb2}
# Discard everything up to and including the <BEGIN> line.
1 while defined($_ = <>) && !/<BEGIN>/;
# Echo subsequent lines until <END> (or end of input) is reached.
while (<>) {
    /<END>/ and last;
    print;
}
\end{SaveVerbatim}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Could be done in more than one way:

 e.g. as:~~ 
\fbox{~\useVerbStretchFNS{.9}{vrb1}~}
~~ or as:~~ 
\fbox{~\useVerbStretchFNS{.9}{vrb2}~}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Assume that, in reading text from an HTML file, we can identify a URL
(`web address') expression {\it within\/} a line of text provided that the string 
meets the following requirements:

\begin{itemize}
\item[i.] the string should fall between double quotes ({\tt "}) 
\item[ii.] it should start with {\tt http://}
\item[iii.] it should end with {\tt.htm} or {\tt.html} 
\item[iv.] between this start and end, there may be any sequence of
  characters except that {\tt "}, {\tt <} and {\tt >} may not appear. 
\end{itemize}

Write a Perl regular expression that will match against strings that
contain a URL expression, with the
URL string found being assigned to the variable \verb+$1+.
\mypercent{10}
\end{qupart}

\SaveVerb{vrb1}+/\"(http:\/\/[^\"<>]*\.htm(l)?)\"/+

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

\hspace*{40mm}
\UseVerb{vrb1}
\medskip

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{SaveVerbatim}{vrb1}
$_ = 'abcde';
s/(\w)(\w)/$2$1/g;
print "$_\n";
\end{SaveVerbatim}

\begin{SaveVerbatim}{vrb2}
$_ = 'abcde';
while (s/(a)(\w)/$2$1/) { }
print "$_\n";
\end{SaveVerbatim}

\begin{qupart}
Specify what will be printed by each of the following pieces of Perl
code: \useVerbLineCentering{t}

\begin{itemize}
\item[i.] \fbox{~\useVerbStretchFNS{.9}{vrb1}~}
\mypercent{5}
\medskip

\item[ii.] \fbox{~\useVerbStretchFNS{.9}{vrb2}~}
\mypercent{5}
\bigskip
\end{itemize}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

\begin{itemize}
\item[i.] gives: {\tt 'badce'}
\medskip

\item[ii.] gives: {\tt 'bcdea'} -- iteratively transposes the {\tt a}
  down to the end of the string.
\end{itemize}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\end{question}

%\ugOnly{\medskip}%
\mscOnly{\continued 

\begin{center}\bf%
SECTION B%
\end{center}}%

\begin{question}

\begin{qupart}
Describe Lesk's algorithm for word sense disambiguation using
dictionary definition overlap and explain its
limitations. \mypercent{25}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Lesk's algorithm relies on the textual definitions of words in a
dictionary to disambiguate words. The pairwise combinations of senses
are considered and the number of words they share in common
counted. The pair of senses with the largest number of words in common
are chosen as the senses. 

To work well the algorithm requires the definitions to be pre-processed
in a number of ways. Empty heads and stop words are removed. (Empty
heads are short expressions generally found at the start of dictionary
definitions which indicate the hypernym of the definition, such as "a
type of".) The remaining words are stemmed. 

The limitations are: 

\begin{enumerate}
\item The approach depends on the correct senses sharing words in their
   definitions. This means that the approach depends on the particular
   dictionary used and will prefer senses with longer definitions over
   shorter ones. 
\item The approach may not be tractable for long sentences; the number
   of sense combinations which have to be checked could be prohibitive. 
\end{enumerate}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Describe two techniques which can be used to annotate text with word
sense information automatically without the need for manual
annotation. These techniques are used to provide data which can be
used to train and/or test a word sense disambiguation system. Describe
the disadvantages of each.\mypercent{15}
\end{qupart} 

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Two techniques are:
\begin{enumerate}
\item Make use of parallel corpora where the sense is defined as the
   translation of a word into another language. 

\item Use pseudowords to automatically introduce ambiguity into
   text. (Pseudowords are created by choosing two or more words,
   e.g. car and bottle, and replacing each occurrence of each with
   their concatenation, car-bottle. The task of the disambiguation
algorithm is to identify the original word.)
\end{enumerate}

Their disadvantages are:
\begin{enumerate}
\item Parallel text may be hard to obtain. 

\item Sense distinctions in pseudowords may not be appropriate for actual
   applications (e.g. translation). The sense distinctions are
   artificial and so an algorithm may learn to disambiguate between
   contexts in which each of the words is likely to occur rather than
   meanings of the words. 
\end{enumerate}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
What assumption is made by a naive Bayes classifier when it is used for word
sense disambiguation?
\mypercent{10}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

The naive Bayes classifier assumes that the probabilities of the words
in the context of the ambiguous words are conditionally
independent. That is, the presence, or otherwise, of a word in the
context has no influence on other words. 

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\answersonly{\newpage}

\begin{qupart}
Explain the differences between direct, transfer and interlingua
approaches to Machine Translation. \mypercent{30}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

The key difference between the three approaches is the level of
analysis which is applied to the source text. 

Direct approaches apply very little analysis to the source text
and rely on simple translation of each word in the source text. Statistical 
MT could be considered to be a direct approach. 

Transfer approaches attempt to analyse the structure of the source
text to produce an intermediate representation. The intermediate
representation of a sentence from the input text is then used in
generating the translated sentence in the target language.  Transfer
approach can employ syntactic or semantic representations.

Interlingua approaches rely on a representation of the meaning which
is independent of both the source and target language. The source
sentence goes through syntactic and semantic analysis to be translated
into the interlingua. This representation is then used to generate a
sentence in the target language. The difference between transfer
approaches which use semantic representations and interlingua
approaches rests on the independence of the system used to represent
meaning; interlinguas are completely independent of the source and
target languages while the representation used in semantic transfer simply aims
to capture enough information to allow translation. 

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Describe the approach used by the BLEU system for evaluation of
Machine Translation systems.\mypercent{20}
\end{qupart} 

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

The BLEU system relies on multiple reference translations, each of
which represents a possible way in which a text could be translated
into a target language. The translation being evaluated (the
candidate) is compared against the reference translations by counting
the number of possible ngrams (strings of words) which occur in
them. BLEU assumes that there are several possible ways to translate a
text, each of which is equally valid, and uses multiple reference
translations to provide these alternatives.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\end{question}

\ugOnly{\continued}
\mscOnly{\turnover}

\begin{question}

\begin{qupart}
What are stop words, in the context of an Information Retrieval
  system? Why are they generally not included as index terms?
\mypercent{10}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Stop words are generally words belonging to closed grammatical classes
such as determiners, conjunctions and prepositions. They are
not included as index terms as they occur so frequently in documents
that they are not good discriminators between relevant and irrelevant
documents.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\answersonly{\newpage}

\begin{qupart}
Consider the following collection of three short documents:\\ 
\\
\hspace{10ex}Document 1: Tyger, tyger, burning bright\\ 
\hspace{10ex}Document 2: Tyger sat on the mat\\
\hspace{10ex}Document 3: The bright mat\\
\\
Show how the documents would be represented in a vector
space model for Information Retrieval, as vectors in which term weights
correspond to term frequencies. Do not remove stop words,
and do not use stemming in creating these representations. 
\mypercent{10}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

The documents would be represented as follows: 
\begin{center}
\begin{tabular}{|cccccccc|}
\hline
           & bright & burning & mat & on & sat & the & tyger\\
Document 1 &  1  &   1  &  0 & 0 &  0 &  0 &   2\\               
Document 2 &  0  &   0  &  1 & 1 &  1 &  1 &  1\\
Document 3 &  1  &   0  &  1 & 0 &  0 &  1 &   0\\
\hline
\end{tabular}
\end{center}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
The cosine coefficient can be used by Information retrieval systems to
rank the relevance of documents in relation to a query. Compute the
similarity that would be produced between Document 1 and the query
``burning tyger'' using this measure. \mypercent{15}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

\begin{math}
\cos(\textrm{Document 1}, \textrm{``burning tyger''}) =
\frac{1\cdot 0 + 1\cdot 1 + 0\cdot 0 + 0\cdot 0 + 0\cdot 0 + 0\cdot 0 + 2\cdot 1}
     {\sqrt{1^{2} + 1^{2} + 0^{2} + 0^{2} + 0^{2} + 0^{2} + 2^{2}}\sqrt{1^{2} + 1^{2}}} =
\frac{3}{\sqrt{6}\sqrt{2}} = 0.866
\end{math}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Explain what is meant by term weighting in Information Retrieval
  systems and why it is used. \mypercent{15}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Term weighting is the process of deciding on the importance of each
term and is normally carried out by assigning a
numerical score to each term. 

Term weighting is used in IR systems because not all terms are equally
useful for retrieval; some occur in many of the documents in the
collection (and are therefore bad discriminators) while others occur
infrequently (and will return few documents). Term weighting aims to assign high scores
to terms which
are likely to discriminate between relevant and irrelevant documents. The term
weights are taken into account by the ranking function with the aim of improving
retrieval performance. 

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Explain how tf.idf term weighting is used to assign weights to terms in
  Information Retrieval. Include the formula for computing tf.idf. \mypercent{15}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

tf.idf assigns weights to terms by taking into account two elements:
the frequency of that term in a particular document and the proportion
of documents in the corpus in which it occurs.  The tf part prefers
terms which occur frequently in a document and the idf part gives
extra weight to terms which do not occur in many documents in the
collection. The tf.idf weight is produced by computing the product of
these two terms.

The formula for computing tf.idf is:

$tf.idf = tf_{ik} \times log_{10}\left(\frac{N}{n_{k}}\right)$

where $tf_{ik}$ is the frequency of term $k$ in document $i$, N is the
total number of documents in the collection and $n_{k}$ the number
which contain the term $k$.

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\answersonly{\newpage}

\begin{qupart}
Show the weights which would be generated if tf.idf weighting was applied to Document 1 
in the document collection of three documents shown above. \mypercent{10}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

Using tf.idf the weights in document 1 would become: 

\begin{center}
\begin{tabular}{|cccccccc|}
\hline
           & bright & burning & mat & on & sat & the & tyger\\
Document 1 & 0.176  &  0.477  &  0  & 0  &  0  &  0  & 0.352\\
\hline
\end{tabular}
\end{center}    

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Consider the following ranking of ten documents produced by an
Information Retrieval 
  system. The symbol $\checkmark$ indicates that a retrieved document is
  relevant and $\times$ that it is not. 

\begin{center}
\begin{tabular}{|c|c|c|}
\hline
Document & Ranking & Relevant\\
\hline
d6	&  1	& $\checkmark$  \\
d1	& 2	& $\times$  \\
d2	& 3	& $\checkmark$  \\
d10	& 4	& $\times$  \\
d9	& 5	& $\checkmark$  \\ 
d3	& 6	& $\checkmark$  \\
d5	& 7	& $\times$  \\
d4	& 8	& $\times$  \\
d7	& 9	& $\checkmark$  \\
d8	& 10	& $\times$  \\
\hline
\end{tabular}
\end{center}
\vspace{4ex} Compute the (uninterpolated) average precision and
interpolated average precision for this ranked set of
documents. Explain your working.  \mypercent{25}
\end{qupart}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

\begin{center}
\begin{tabular}{|c|c|c|c|c|c|}
\hline
Document & Ranking & Relevant & Recall & Precision & Interpolated Precision\\
\hline
d6  &	 1& 	 $\checkmark$&	  0.2&	 1	&   1\\
d1  &	 2&	 $\times$&	  0.2&	 0.5	&   0.67\\
d2  &    3&	 $\checkmark$&	  0.4&	 0.67	&  0.67\\
d10 & 	 4&	 $\times$&	  0.4&	 0.5	&   0.67\\	
d9  &	 5&	 $\checkmark$&	  0.6&	 0.6	&   0.67\\
d3  &	 6&	 $\checkmark$&	  0.8&	 0.67	&   0.67\\
d5  & 	 7&	 $\times$&	  0.8&	 0.57	&   0.57\\
d4  & 	 8&	 $\times$&	  0.8&	 0.5	&   0.56\\
d7  &	 9&	 $\checkmark$&	  1.0&	 0.56	&   0.56\\
d8  &	 10&	 $\times$&	  1.0&	 0.5	&   0.5\\
\hline
\end{tabular}
\end{center}

(Uninterpolated) average precision is computed by averaging the
precision score each time a relevant document is found. So, (uninterpolated) 
average precision is given by $\frac{1 + 0.67 + 0.6 + 0.67 + 0.56}{5}$ = 0.7

Interpolated average precision is computed by taking the interpolated
precision at 11 sample points (0\%, 10\%, 20\%, ... 100\% recall). So,
interpolated average precision is given by: $\frac{(3 \times 1) + (6
  \times 0.67) + (2 \times 0.56)}{11}$ = 0.74

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\end{question}

\ugOnly{\turnover}
\mscOnly{\continued}

\answersonly{\newpage}

\begin{question}
We want to create a Perl program that will identify the tokens
appearing in two text files (ignoring stop words), and then use this
information to compute a measure of the similarity of the two files,
using a metric that is detailed below. 

\begin{qupart}
Write a Perl subroutine {\tt tokenise} which takes a single input
parameter, which is a string (corresponding to a line of input from a
text file), and returns a list of strings for the {\it tokens\/}
appearing in the input. For example, given the input string {\tt
  "Today, John found anti-matter.\verb+\+n"}, the subroutine would return the
list {\tt ("today", "john", "found", "anti", "matter")}. Assume that
tokens are any consecutive sequence of alphabetic characters, with any
non-alphabetic characters (including punctuation) serving to separate
tokens, as the example illustrates. We are not interested in case
distinctions, e.g. {\tt John} vs.\ {\tt john}, so the tokens should be 
mapped to lowercase. 
\mypercent{25}
\end{qupart}

\begin{SaveVerbatim}{vrb1}
sub tokenise {
    # Lowercase the line and split on runs of non-letters.  grep drops
    # the empty leading field that split() yields when the line starts
    # with a non-letter (space, quote, digit, ...), so only real
    # tokens are returned.
    return grep { length } split(/[^a-z]+/, lc $_[0]);
}
\end{SaveVerbatim}

\begin{SaveVerbatim}{vrb2}
sub tokenise {
    # Lowercase the line, squash every run of non-letters to a single
    # space, then split on whitespace (split ' ' discards any leading
    # blanks, so no empty tokens are produced).
    my $text = lc($_[0]);
    $text =~ s/[^a-z]+/ /g;
    return split(' ', $text);
}
\end{SaveVerbatim}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

A compact solution is: ~ \fbox{~\useVerbStretchFNS{.9}{vrb1}~}
\vspace*{2mm}

A less terse, but still good, answer might be:~~ 
\fbox{~\useVerbStretchFNS{.9}{vrb2}~}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Write a Perl subroutine {\tt readStoplist} that takes as its single
parameter the name of a file that contains a list of stopwords, and
which reads in and stores the stopwords for subsequent use. As part of
your solution, you will need to determine a good data structure for
storing the words, i.e. one allowing them to be efficiently accessed.
Define also a subroutine {\tt isaStopword} which when given a string
for a token, determines whether or not it is a stopword, i.e. so {\tt
  isaStopword(\verb+$+w)} returns {\tt 1} if the value of \verb+$w+ is
a stopword, and returns {\tt 0} otherwise.
\mypercent{25}
\end{qupart}

\begin{SaveVerbatim}{vrb1}
sub readStoplist {
    # Load the stopword file (one word per line) into the global hash
    # %stops so that membership can later be tested in O(1).
    my ($file) = @_;
    open(STOP,"< $file") 
	|| die "ERR: problem opening file ($file)\n";
    while (<STOP>) {
	chomp;	# chomp, not chop: chop would eat a letter of the
		# final word if the file lacks a trailing newline
	$stops{$_} = 1;
    }
    close STOP;
}
\end{SaveVerbatim}

\begin{SaveVerbatim}{vrb2}
sub isaStopword {
    # A word is a stopword exactly when readStoplist recorded it in
    # %stops (the stored value is always 1); return 0 -- never undef
    # -- for anything else.
    my ($word) = @_;
    return $stops{$word} ? 1 : 0;
}
\end{SaveVerbatim}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

A good solution for storing the stopwords is as the keys of a hash,
where the associated value is just {\tt 1} - as this allows us to
check if some word is stored here or not very efficiently. Solutions
where the stopwords are stored in a list, which is traversed to check
the status of some token, will receive lesser credit. 

\hspace*{50mm}
\fbox{~\useVerbStretchFNS{.9}{vrb1}~}

\medskip

Given the use of a hash, this definition has little work to do (mostly
ensuring that {\tt 0} is returned rather than {\tt undef}). If the
stopwords have been stored in a list, then this subroutine will have
more work to do. 

\hspace*{40mm}
\fbox{~\useVerbStretchFNS{.9}{vrb2}~}

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\begin{qupart}
Write a Perl subroutine {\tt countTokens} that takes a file name as
its single parameter, and which counts the occurrences of tokens in
the file, ignoring stopwords, and returns its results as a hash,
i.e. a hash in which each key is a token appearing in the file, for
which the associated value is a positive integer indicating how many
times the token appears. Your definition should use the subroutines
specified in parts (a) and (b) of the question (irrespective of
whether you were able to provide working definitions for them).
\mypercent{25}
\end{qupart}

\begin{SaveVerbatim}{vrb1}
sub countTokens {
    # Count the occurrences of each non-stopword token in the named
    # file, returning the results as a (flattened) hash token => count.
    # Uses tokenise() and isaStopword() defined above.
    my ($file) = @_;
    my ($w,@wds,%counts);
    open(IN,"<$file")
	|| die "ERROR: problem opening file ($file)\n";
    while (<IN>) {
	@wds = tokenise($_);
	foreach $w (@wds) {
	    unless (isaStopword($w)) {
		$counts{$w}++;
	    }
	}
    }
    close IN;	# the original leaked the bareword handle
    return %counts;
}
\end{SaveVerbatim}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

\hspace*{25mm}
\fbox{~\useVerbStretchFNS{.9}{vrb1}~}

\medskip

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\answersonly{\newpage}

\begin{qupart}
Write a Perl {\it program\/} which requires three arguments on the
command line, of which the first is the name of the stopword file, and
the others are the names of two files that are to be compared,
i.e. which might be invoked as: \smallskip

{\centerline{\tt perl myprogram.pl stoplist.txt file1.txt file2.txt}}
\smallskip

The program should use the previously specified subroutines to load
the stopwords, and to count the tokens in the other two files, so that
this information can be used to compute a similarity score for the two
files. The measure to be computed is the proportion of terms
(i.e. distinct tokens) appearing in {\it either\/} file that appear in
{\it both\/} files. Note that this particular measure makes no use of
the specific {\it counts\/} that have been made of token occurrences,
only whether or not a specific distinct token appears in a given file
or not. If $A$ is the {\it set\/} of terms (irrespective of counts)
appearing in one file, and $B$ the corresponding set for the other
file, then this measure can be expressed as:

\[ \frac{\mid A\cap B\mid}{\mid A\cup B\mid} \]

The calculation of this measure can be done either in a subroutine or
in the main body of the code, as you prefer. Your program should print
the similarity value of the two files to {\tt STDOUT} before
terminating. 
\mypercent{25}
\end{qupart}

\begin{SaveVerbatim}{vrb1}

@ARGV == 3
    || die "ERROR: must specify 3 input files\n";

my ($stopFile, $file1, $file2) = @ARGV;

readStoplist($stopFile);
%counts1 = countTokens($file1);
%counts2 = countTokens($file2);

# Union of the two term sets: every distinct token seen in either file.
my %vocab = map { $_ => 1 } (keys %counts1, keys %counts2);

# Intersection: distinct tokens appearing in both files.
my $shared = grep { exists $counts1{$_} && exists $counts2{$_} } keys %vocab;

# |A n B| / |A u B|, guarding against two empty files.
my $total = keys %vocab;
if ($total > 0) {
    printf "Similarity: %.2f\n", $shared/$total;
} else {
    print "Similarity: 0\n";
}
\end{SaveVerbatim}

\answer{%%%%%% **** BEGIN ANSWER **** %%%%%%

\hspace*{25mm}
\fbox{~\useVerbStretchFNS{.9}{vrb1}~}

\medskip

} %%%%%%%%%%%%% **** END ANSWER **** %%%%%%

\end{question}

\questionsend
\end{exam}


\end{document}