\documentclass[12pt]{book}

\usepackage{tuos_exam}
\usepackage{amsfonts}
\usepackage{amsmath}

%\showanswers % *** TOGGLE IN/OUT TO SHOW/HIDE ANSWERS *** 

\pressmark{COM3110/COM6150}
\pressmark{COM3110}
\pressmark{COM6150}
\department{{\bf DEPARTMENT OF COMPUTER SCIENCE}}
\examtitle{{\bf TEXT PROCESSING}}
\examdate{{\bf Autumn Semester 2007-08}}
\examtime{{\bf 2 hours}}
% \dataProvided{Deduction Rules}
\setter{Mark Hepple}
\rubric{{\bf Answer THREE questions.

All questions carry equal weight. Figures in square brackets indicate the
percentage of available marks allocated to each part of a question.}}

%\doNotRemove %% MODIFIES FRONT PAGE TO "DO NOT REMOVE FROM HALL" FORMAT

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newcommand{\stretchit}[1]{\addtolength{\itemsep}{#1mm}}

\newcommand{\argmax}{\operatornamewithlimits{argmax}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}

\begin{exam}

\begin{question}
\begin{qupart}
What is {\it part-of-speech-tagging\/}? Explain why this task is difficult,
i.e. why it requires something more than just simple dictionary
look-up. 
\mypercent{20}

\begin{answer}
  This is the task of assigning to each word (or, more generally, each
  token) in a text a tag or label for the word's part-of-speech class,
  such as noun, verb or adjective. These classes group together words
  that exhibit similar distributional behaviour, i.e. which play
  similar roles in the syntax of the language. The task is non-trivial
  due to
  (i)~ambiguity, i.e. many words have more than one POS, and the
  correct one must be selected (on the basis of local context), and
  because (ii) it is common to encounter unknown words (i.e. ones not
  present in the dictionary, even if this has been compiled from a
  large amount of text), for which a POS tag must be `guessed' (using
  context and morphological information).

%  Ambiguity is resolved by exploiting context (generally local
%  context), i.e. since different candidate tags are differentially
%  likely to occur in different contexts. Tag guessing for unknown
%  words also exploits context information, together with morphological
%  analysis, e.g. that the word ends with a char sequence that is a
%  morphological suffix typical of a given word class.

\end{answer}
\end{qupart}

\begin{qupart}
Explain what is meant by the {\it bag of words\/} model in text
processing. Discuss the relevance of this model to work on {\it
  information retrieval}. \mypercent{20} 

\begin{answer}
  The bag of words model assumes that a text can be represented simply
  in terms of the words it contains and, possibly, their frequencies
  in the text. Information about the relations between these words and
  their position in the text is ignored. Standard IR models (e.g. the
  vector-space model) treat documents and queries as bags-of-words,
  i.e. they use similarity/etc measures for comparing documents and
  queries that rely only on the words that appear in them and
  (possibly) their frequencies. Accordingly, IR indexing methods
  provide efficient encodings of bag-of-words type information on
  documents, discarding information about the order of words within
  them.
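
  For illustration, such a representation is no more than a table of
  term frequencies; a minimal Perl sketch (the variable names are ours)
  might be:

\begin{verbatim}
%bag = ();                    # term => frequency in the text
while (<>) {
    foreach $w (split /[^a-zA-Z]+/, lc) {
        next if $w eq '';     # skip empty fields from the split
        $bag{$w}++;           # order and position are discarded
    }
}
\end{verbatim}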
\end{answer}
\end{qupart}

\begin{qupart}
  Write down the Bayes Rule, and briefly explain what it means. State
  the key assumption that is made in a Naive Bayesian approach, and
  show how this allows the rule to be reformulated. \mypercent{20}

\begin{answer}
Bayes rule is: 
$P(A|B) = \frac{P(B|A)P(A)}{P(B)}$

In text processing (and other ML) contexts, the rule is commonly used
to determine the probability of some hypothesis $h$ on the basis of 
some data or evidence that can be decomposed into a series of
subcomponents or features  $f_1\ldots f_n$, i.e. so the formula might
be restated as:  

\[ P(h|f_1\ldots f_n) = \frac{P(f_1\ldots f_n|h)P(h)}{P(f_1\ldots
  f_n)} \]

The Naive Bayesian assumption is that the features are conditionally
independent (or that treating them as such is an acceptable
approximation), allowing the formula to be rewritten as:

\[ P(h|f_1\ldots f_n) =
\frac{P(h)\prod_{i=1}^n P(f_i|h)}{\prod_{i=1}^n P(f_i)} \]

The probabilities appearing in this formula can in general be more
readily estimated from training data than those in the preceding
formula (due to problems of data sparseness).
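
Since the denominator does not depend on $h$, it can be dropped when
classifying, giving the familiar Naive Bayes decision rule:

\[ \hat{h} = \argmax_h P(h) \prod_{i=1}^n P(f_i|h) \]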
\end{answer}
\end{qupart}

\begin{qupart}
  Write a short Perl program that reads from a file (on {\tt
    STDIN}), and which prints (to {\tt STDOUT}) any line of the
  input file that contains the words ``secret mission'' (appearing
  together in this order) {\it irrespective\/} of their capitalisation
  (i.e. so that occurrences   of e.g. ``Secret~ Mission'' and
  ``SECret~~ MiSsiON'' should count as 
  matches). Other lines from the input file should be ignored. 
\mypercent{20}

\begin{answer}

\begin{verbatim}
while (<>) {
   if (/\bsecret\s+mission\b/i) {
      print;
   }
}
\end{verbatim}

\end{answer}
\end{qupart}

\answersonly{\newpage}

\begin{qupart}
  Write a short Perl program that reads a file (on {\tt STDIN})
  containing an unsorted list of words (one word per line), and
  prints those words out (to {\tt STDOUT}) in {\it alphabetically
    sorted order}. \mypercent{20} 

\begin{answer}

\begin{verbatim}
while (<>) {
    if (/^(\w+)\s*$/) {
        push @words, $1;
    }
}
foreach $w (sort @words) {
    print ">> $w\n";
}
\end{verbatim}

\end{answer}
\end{qupart}


\end{question}

\bigskip

\answersonly{\newpage}

\begin{question}
\begin{qupart}
Explain the differences between direct, transfer and interlingua
approaches to Machine Translation. \mypercent{30}

\begin{answer}
The key difference between the three approaches is the level of
analysis which is applied to the source text. 

Direct approaches apply very little analysis to the source text
and rely on simple translation of each word in the source
text. Statistical MT could be considered to be a direct approach.

Transfer approaches attempt to analyse the structure of the source
text to produce an intermediate representation. The intermediate
representation of a sentence from the input text is then used in
generating the translated sentence in the target language.  Transfer
approach can employ syntactic or semantic representations.

Interlingua approaches rely on a representation of the meaning which
is independent of both the source and target language. The source
sentence goes through syntactic and semantic analysis to be translated
into the interlingua. This representation is then used to generate a
sentence in the target language. The difference between transfer
approaches which use semantic representations and interlingua
approaches rests on the independence of the system used to represent
meaning; interlinguas are completely independent of the source and
target languages while the representation used in semantic transfer
simply aims to capture enough information to allow translation.
\end{answer}
\end{qupart}

\begin{qupart}
Describe the main features of the approach to statistical machine
translation developed by the IBM research group. 
\mypercent{40}

\begin{answer}
This is an essay-type question. We list here  general points that
  the students should discuss. 

\bigskip

Machine translation can be cast in terms of the noisy-channel
model. The approach was pioneered by IBM in the early 90's. If we want
to translate French into English, we shall assume that someone has $E$
in his head but by the time he writes it down it's {\it corrupted} and
becomes $F$. To recover the original $E$, need to reason about:
(a)~the kinds of things people say in English and (b)~how English gets
turned into French. Given a French sentence $F$ the approach searches
for English $E$ that maximises $P(E|F)$.

\[
\begin{array}{lll}
\argmax_{E}P(E|F) &  = & \argmax_{E} \frac{P(E) \cdot P(F|E)}{P(F)} \\
                   
                  & =  &  \argmax_{E} P(E) \cdot P(F|E)  \\
\end{array}
\]

We choose not to model $P(E|F)$ directly. Instead, we break things
apart to get good translations even if the probability numbers are not
that accurate. $P(F|E)$ will ensure that a good $E$ will have words
that generally translate to words in $F$. $P(E)$ will help save us
from bad English and $P(F|E)$ only needs to say whether a bag of
English words corresponds to a bag of French words.  $P(E)$ and
$P(F|E)$ can be trained independently.
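
As a clarifying aside, $P(E)$ is typically an $n$-gram language model
estimated from monolingual English text, e.g. for a bigram model over a
sentence $e_1 \ldots e_m$:

\[ P(E) = \prod_{i=1}^{m} P(e_i|e_{i-1}) \]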

In order to model $P(F|E)$ we need a sentence-aligned parallel
(bilingual) corpus and a model of translation $\sim P(F|E)$.
The latter assumes word alignments which we do not have. We can
approximate those by using EM, an unsupervised learning algorithm that
can learn alignment probabilities.
\end{answer}
\end{qupart} 

\begin{qupart}
  Do you believe that the approach cited in (b) is suitable for use
  with any pair of languages for which a large enough corpus of text
  is available? Justify your answer.  \mypercent{10}

\begin{answer}
The approach is limited to closely related language pairs. It
  will be difficult to translate languages that are structurally very
  different (e.g.,~languages that have different notions of what
  counts as a word or languages that have radically different word
  orders). Answers should explain why this presents a problem for the
  model and suggest examples of pairs of languages that are
  similar/different in the relevant manner. 
\end{answer}
\end{qupart} 

\begin{qupart}
Describe the approach used by the BLEU system for evaluation of
Machine Translation systems.\mypercent{20}

\begin{answer}
The BLEU system relies on multiple reference translations, each of
which represents a possible way in which a text could be translated
into a target language. The translation being evaluated (the
candidate) is compared against the reference translations by counting
the number of possible ngrams (strings of words) which occur in
them. BLEU assumes that there are several possible ways to translate a
text, each of which is equally valid, and uses multiple reference
translations to provide these alternatives.
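
For reference, the component modified $n$-gram precisions $p_n$
(clipped so that a candidate $n$-gram cannot be counted more often than
it appears in any reference) are combined with weights $w_n$ (typically
uniform) and a brevity penalty $BP$ for candidates shorter than the
references:

\[ \mathit{BLEU} = BP \cdot \exp\Big( \sum_{n=1}^{N} w_n \log p_n \Big) \]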
\end{answer}
\end{qupart} 

\end{question}


\newpage

\begin{question}
\begin{qupart}
  Explain what is meant by the use of (i) a {\it stoplist\/} and (ii)
  {\it stemming\/} in the context of information retrieval. Discuss
  the potential benefits of using these methods for an information
  retrieval system. \mypercent{{20}}

\begin{answer}
  A {\it stoplist\/} is a list of words (`stop-words') that are
  ignored when documents are indexed, and likewise discounted from
  queries. These are words that are so widespread in the document
  collection (i.e.  have a high document frequency) that they are of
  very little use for discriminating between documents that are/are
  not relevant to a query. Whilst their inclusion would not contribute
  to effective retrieval (and may interfere with this), their
  exclusion eliminates a large number of term occurrences (as
  stop-words are frequent terms) that would need to be recorded,
  thereby reducing the size of indexes and saving computational effort
  during both indexing and retrieval. 

\medskip

{\it Stemming\/} refers to the process of reducing words that are
morphological variants to their common root or stem, e.g. so that
variants {\it computer, computes, computed, computing}, etc are
reduced to the stem {\it compute\/} (or some surrogate for the real
stem, such as {\it comput\/}). When stemming is used in IR, it is
applied to documents before indexing so that indexes record
occurrences of stems (or strictly occurrences of words that reduce to
some stem), and is applied to queries also before retrieval.  The
effect of this move is that when a query contains a term such as {\it
  computing}, retrieval can potentially return documents on the basis
of their containing any of the morphological variants of the same
root. This is the intended key benefit of using stemming, although its
usefulness is debated. Stemming will also produce some reduction in
the size of document indexes. 
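
As an illustration only (real stemmers such as Porter's use a much
larger rule set), a crude suffix-stripping stemmer can be sketched in a
few lines of Perl:

\begin{verbatim}
sub stem {
    my $w = lc shift;
    $w =~ s/(ing|ed|es|er|s)$//;   # strip one common suffix
    return $w;                     # "computes", "computed", "computing"
}                                  # all reduce to "comput"
\end{verbatim}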
\end{answer}
\end{qupart}

\begin{qupart}
  Explain what is meant by an {\it inverted index}, and why such
  indices are important in the context of information retrieval. Show
  how Documents 1 and 2 below would be stored in an inverted index,
  using stemming and a stoplist of your choice.  \mypercent{25}

\medskip

\begin{quote}\small
{\bf Document 1}:~ She sells sea shells on the sea shore. 
\medskip

{\bf Document 2}:~ Many shells and sea creatures are sold at the
market. 
\end{quote}

\begin{answer}
  An inverted index is a data structure that lists for each word in a
  document collection all documents that contain it and the frequency
  of occurrence in each document. An inverted index makes it easy to
  search for `hits' of a query word, i.e.  one just goes to the part
  of the inverted index that corresponds to the query word to get a
  list of document identifiers for documents containing the word.  A
  more sophisticated version of an inverted index might also record
  position information for occurrences (encoded as a byte offset
  relative to the beginning of the document), to allow for efficient
  retrieval of phrases.  

  We adopt the stoplist: \{she, on, the, many, and, are, at\} on the
  grounds that these are very common words that will serve as poor
  discriminators. Given this stoplist and stemming (which reduces
  e.g. both {\it sells\/} and {\it sold\/} to {\it sell\/}), and
  assuming that there are no other documents, so that only terms in
  these documents need be considered, we get the following inverted
  index:

\begin{table}[h!]
\begin{center}
\begin{tabular}{|c|l|l|}\hline
\it term-id & word & doc\\ \hline 
1  & creature &  d2:1 \\
2  &  market  &  d2:1 \\
3  &  sea     &  d1:2, d2:1 \\
4  &  sell    &  d1:1, d2:1 \\
5  &  shell   &  d1:1, d2:1 \\
6  &  shore   &  d1:1 \\ \hline
\end{tabular}
\end{center}
\end{table}

Here, entries record frequency counts, e.g. the entry for {\it sea\/}
tells us that the term appears twice in Document 1 (d1:2) and once in
Document 2 (d2:1). 
\end{answer}
\end{qupart}

\begin{qupart}
  Calculate the similarity of Documents 1 and 2 using a measure of
  choice. Explain your measure and justify its choice.  \mypercent{25}

\begin{answer}
An obvious choice is to follow the vector space model and treat the
documents as vectors of occurrences of terms from the above index
(i.e. in the order used there, and ignoring stop-words), as follows: 

\begin{quote}
Doc-1: $\left<0, 0, 2, 1, 1, 1\right>$

Doc-2: $\left<1, 1, 1, 1, 1, 0\right>$  
\end{quote}


and also to use the cosine between these vectors as the measure of
their similarity, i.e.: 

\[
cos(\vec{x},\vec{y}) = \frac{\vec{x} \cdot \vec{y}}{|\vec{x}| |\vec{y}|} =
\frac{\sum^{n}_{i=1}x_iy_i}{\sqrt{\sum^{n}_{i=1}x_i^2} \sqrt{\sum^{n}_{i=1}y_i^2}}
\]

\[
= \frac{0 \cdot 1 + 0 \cdot 1 + 2 \cdot 1 + 1 \cdot 1 + 1 \cdot 1 + 1 \cdot 0}{%
\sqrt{0^2+0^2+2^2+1^2+1^2+1^2} \cdot \sqrt{1^2+1^2+1^2+1^2+1^2+0^2}}
\]

\[ =~~ \frac{4}{\sqrt{7}\sqrt{5}} ~~=~~ \frac{4}{\sqrt{35}} ~~\approx~~ 0.68 \]

\end{answer}
\end{qupart}

\medskip

Assume that we have a collection of documents which are stored in a
{\it single\/} text file, in the manner shown in the box below. Note
that the file includes XML-style tags, which mark the beginning and
end of each document and also its identifier number. 

\bigskip

\hspace*{25mm}
\fbox{\rule{0mm}{4mm}~\parbox[t]{90mm}{\tt\small
<document docid=1>\\
Mary had a little lamb. \\
Its fleece was white as snow. \\
</document>\\
<document docid=2>\\
Little Miss Muffet, sat on her tuffet. \\
</document>\\
\hspace*{12mm}\vdots\\
<document docid=$N$\/>\\
Mary, Mary, quite contrary. \\
How does your garden grow?\\
</document>\\[-3mm]
}}
\medskip

\begin{qupart}
  Write a perl program which will read a file which is in this format
  and which will compute an inverted index. (NOTE: The requirement
  here is only that you should compute the information required for an
  inverted index and store it within an appropriate data
  structure. You do not need to print the index out to a file.)  You
  may decide for yourself whether to record term frequency information
  or not. Let the definition of {\it term\/} be any (maximal)
  alphabetic sequence, i.e. do not try to accommodate terms that
  contain any non-alphabetic characters. Do not use a stoplist or
  stemming.  \mypercent{30}

\begin{answer}

Here is a simple solution, which stores the info via variable 
{\tt\$index} which is a reference to a hash (keyed on token) of
hashes (mapping doc ids to term counts): 

\begin{verbatim}
while (<>) {
    chomp;
    if (/^\s*$/) {
        next;
    } elsif (/^<document\s+docid=(\d+)>\s*$/) {
        $docnum = $1;
    } elsif (/^<\/document>\s*$/) {
        next;
    } else {
        @tokens = split /[^a-zA-Z]+/;   # split on non-empty sequence
                                        # of non-alphabetic chars
        foreach $token (@tokens) {
            next if $token eq '';        # skip any 'null' tokens
            $index->{$token}{$docnum}++; # count term/doc occurrence
        }
    }
}
\end{verbatim}

\end{answer}
\end{qupart}

\begin{comment}
\begin{qupart}
  Write some further Perl code, which can be added to your program
  from (d), and which will print the inverted index information that
  has been computed out to a file. The information should be printed
  in a manner such that it could be later reloaded. (You do not need
  to write code for this reloading.)
\mypercent{15}

\begin{answer}

\begin{verbatim}
foreach $token (sort keys %{$index}) {
    print $token;
    foreach $doc (sort keys %{$index->{$token}}) {
	$count = $index->{$token}{$doc};
	print " d$doc:$count";
    }
    print "\n";
}
\end{verbatim}

\end{answer}
\end{qupart}  
\end{comment}

\end{question}

\newpage

\begin{question}
\begin{qupart}
  Explain what is meant by {\it summarisation\/}.  List and explain a
  number of the ways in which different kinds of summary have
  conventionally been subdivided.  \mypercent{20}

\begin{answer}
  In its most general sense, summarisation is the art of abstracting
  key content from one or more information sources. This has most
  commonly been done for textual sources (e.g. newspaper articles,
  journal papers), but this is not necessarily so, e.g. a textual 
  summary can be provided of the events in video material (such 
  as a football match). 

Standard ways in which summaries have been distinguished are as
follows. Answers should explain the terms used. 

\begin{enumerate}
\item abstract vs.\ extract

\item indicative vs.\ informative vs.\ evaluative (critical) summaries

\item single vs.\ multi-document summaries

\item generic vs.\ query-based vs.\ user-profile-based summaries

\item summary for expert vs.\ novice users
\end{enumerate}
\end{answer}
\end{qupart}

\begin{qupart}
Explain what is meant by `deep' and `shallow' approaches to
automatic text summarisation, and discuss their differences. 
\mypercent{10}

\begin{answer}
Deep approaches:

\begin{itemize}
\item are knowledge-rich
\item involve a significant extent of `semantic analysis'
\item employ a sizeable knowledge-base of rules that are costly to
  create/maintain
\item are domain dependent
\item are difficult/costly to adapt to new domains
\item produce abstracts
\end{itemize}

By contrast, shallow approaches: 

\begin{itemize}
\item use an analysis based primarily on words and other shallow
  document features
\item involve no significant `semantic analysis'
\item are knowledge-poor (or `-light')
\item have little domain dependence and so are easily ported to new
  domains
\item produce extraction-based summaries
\end{itemize}
\end{answer}
\end{qupart}

\begin{qupart}
Discuss the criteria that have been used by {\it shallow\/} approaches
to automated summarisation for identifying text segments containing
significant information in a document. 
\mypercent{20}

\begin{answer}
Methods used include

\begin{itemize}
\item {\bf keywords}: 
  \begin{itemize}
  \item assume frequent content words indicate topic of document
  \item identify a set of keywords by (e.g.) the TF.IDF criterion (see
    the formula after this list)
  \item score sentences w.r.t. occurrence of keywords within them
  \end{itemize}

\item {\bf title method}:
  \begin{itemize}
  \item assume title/subheadings indicative of important content
  \item use words from them to identify useful sentences
  \end{itemize}

\item {\bf position}: 
  \begin{itemize}
  \item use position in document as indicator of importance of text
    segment
  \item notable positions include: (i) being close to start of text,
    (ii) in certain sections (e.g. intro/conclusion), (iii) first
    sentence of paragraph, etc.
  \end{itemize}

\item {\bf cue method}: 
  \begin{itemize}
  \item certain phrases may signal important content (e.g. ``this
    paper presents \dots'')
  \item may distinguish both positive and negative indicators, as
    `bonus' and `stigma' words/phrases
  \end{itemize}
\end{itemize}
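
For the keywords method, the TF.IDF weight of term $t$ in document $d$,
with $N$ documents in the collection of which $df_t$ contain $t$, is
commonly computed as:

\[ w_{t,d} = tf_{t,d} \times \log \frac{N}{df_t} \]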
\end{answer}
\end{qupart}

\begin{qupart}
  Describe the document summarisation approach of Kupiec et al.\
  (1995).  Your answer should address: (i) the nature of their
  training corpus, (ii) the features used as indicators of useful
  sentences, and (iii) how their system determines whether or not to
  include a sentence in the summary.  \mypercent{30}

\begin{answer}
The system presented by Kupiec et al (1995) is a modified Naive
  Bayes classifier. 

{\bf Corpus}: Their training corpus consists of document-summary
  pairs from a number of scientific journals. Before training, they
  run a sentence matching algorithm to determine which sentences in the
  document correspond to sentences in the summary. Instead of doing
  sub-sentence information extraction, the system concentrates on
  sentence selection, so sentences in the training data are marked as
  being part of the summary or not. The system examines a
  set of features for each sentence, all of which are discrete: 

{\bf Features}: 

  \begin{itemize}
  \item {\bf Sentence Length Cut-off Feature:} Short sentences tend to not be
  included in summaries. Sentences longer than a certain threshold
  have a value of true, and the others are false.

  \item {\bf Fixed-Phrase Feature:} Sentences containing or following
  a number of key phrases tend to be included in summaries. Kupiec et
  al. use 26 indicator phrases. Sentences containing those phrases, or
  ones following section heads with key words, have a value of
  true. All other sentences have a value of false.

  \item {\bf Paragraph Feature:} Sentences are categorized as being
  the initial sentence in a paragraph, a medial sentence, or a final
  sentence.

  \item {\bf Thematic Word Feature:} The most frequent content words
  are designated theme words. Sentences are ranked based on the
  frequency of theme words. This feature is binary, depending on
  whether a sentence is among the highest ranked sentences.

  \item {\bf Uppercase Word Feature:} Proper names and acronyms tend
  to be important. This feature is similar to the previous one, with
  sentences ranked according to the frequency of uppercase words that
  appear more than once in the document. 
\end{itemize}

{\bf Selection}: 
  For each sentence $s$, the system determines the probability that it
  will be included in a summary $S$, given features $F_{1},  \ldots,F_{k}$ using
  a Bayes decision rule: 

\[
P(s \in S|F_1, F_2, \ldots, F_k) = P(F_1, F_2, \ldots, F_k|s \in S) \times
\frac{P(s \in S)}{P(F_1, F_2, \ldots, F_k)}
\]

 Assuming that the features are
  statistically independent, the above becomes: 
   
\[
\frac{P(s \in S)\prod_{j=1}^{k}{P(F_{j}| s \in
    S)}}{\prod_{j=1}^{k}{P(F_{j})}}
\]

Thus, the system is essentially a naive Bayes classifier.  $P(s \in
S)$ is constant, and $P(F_j |s \in S)$ and $P(F_j)$ are estimated from
the training set by counting occurrences.

\end{answer}
\end{qupart} 

\answersonly{\newpage}

\begin{qupart}
Discuss alternative approaches to evaluating the performance of
automatic summarisation systems. 
\mypercent{20}

\begin{answer}
Evaluation may be intrinsic (i.e. address the quality of the summary
itself) or extrinsic (assess the impact of using the summary on
subjects' performance at some task).

Tasks used in extrinsic evaluations include the subject's ability to
answer questions about the topic of the original text, or to assign a
topic category to the original text, when given the automatic summary
as opposed to the original text or a human summary. 

Intrinsic evaluation may be done by human judgement (e.g. of whether
key topics of original text are present, whether summary is coherent)
or be done automatically. Human judgements are subjective, and often
vary between judges, and the approach is time-consuming (and therefore
costly). Automatic evaluation requires the creation of gold standard
materials, i.e. collections of documents with associated reference
summaries (which are costly to create). It primarily addresses whether
the content of reference summaries is represented in automated
summaries, not whether the latter are coherent. For sentence
extraction approaches, evaluation against extraction-based reference
summaries may be done using measures such as precision and
recall. This is not possible for abstraction approaches. Alternative
approaches measure textual overlap between automatic and reference
summaries, as an indicator of conceptual overlap. A key approach is
the ROUGE system, which measures (a version of) word n-gram overlap
between automatic and reference summaries.
\end{answer}
\end{qupart}
\end{question}

\questionsend

\end{exam}
\end{document}

\newpage

\subsection*{Machine Translation}

\begin{question}

\begin{qupart}
Explain the differences between direct, transfer and interlingua
approaches to Machine Translation. \mypercent{30}

\begin{answer}
The key difference between the three approaches is the level of
analysis which is applied to the source text. 

Direct approaches apply very little analysis to the source text
and rely on simple translation of each word in the source
text. Statistical MT could be considered to be a direct approach.

Transfer approaches attempt to analyse the structure of the source
text to produce an intermediate representation. The intermediate
representation of a sentence from the input text is then used in
generating the translated sentence in the target language.  Transfer
approach can employ syntactic or semantic representations.

Interlingua approaches rely on a representation of the meaning which
is independent of both the source and target language. The source
sentence goes through syntactic and semantic analysis to be translated
into the interlingua. This representation is then used to generate a
sentence in the target language. The difference between transfer
approaches which use semantic representations and interlingua
approaches rests on the independence of the system used to represent
meaning; interlinguas are completely independent of the source and
target languages while the representation used in semantic transfer
simply aims to capture enough information to allow translation.
\end{answer}
\end{qupart}

\answersonly{\newpage}

\begin{qupart}
Describe the approach used by the BLEU system for evaluation of
Machine Translation systems.\mypercent{20}

\begin{answer}
The BLEU system relies on multiple reference translations, each of
which represents a possible way in which a text could be translated
into a target language. The translation being evaluated (the
candidate) is compared against the reference translations by counting
the number of possible ngrams (strings of words) which occur in
them. BLEU assumes that there are several possible ways to translate a
text, each of which is equally valid, and uses multiple reference
translations to provide these alternatives. 
\end{answer}
\end{qupart} 

\begin{qupart}
Describe the main features of the approach to statistical machine
translation adopted by the IBM research group. Justify your answers.
\mypercent{50}

\begin{answer}
This is an essay-type question. We list here  general points that
  the students should discuss. 

\bigskip

Machine translation can be cast in terms of the noisy-channel
model. The approach was pioneered by IBM in the early 90's. If we want
to translate French into English, we shall assume that someone has $E$
in his head but by the time he writes it down it's {\it corrupted} and
becomes $F$. To recover the original $E$, we need to reason about:
(a)~the kinds of things people say in English and (b)~how English gets
turned into French. Given a French sentence $F$ the approach searches
for English $E$ that maximises $P(E|F)$.

\[
\begin{array}{lll}
\argmax_{E}P(E|F) &  = & \argmax_{E} \frac{P(E) \cdot P(F|E)}{P(F)} \\
                   
                  & =  &  \argmax_{E} P(E) \cdot P(F|E)  \\
\end{array}
\]

We choose not to model $P(E|F)$ directly. Instead, we break things
apart to get good translations even if the probability numbers are not
that accurate. $P(F|E)$ will ensure that a good $E$ will have words
that generally translate to words in $F$. $P(E)$ will help save us
from bad English and $P(F|E)$ only needs to say whether a bag of
English words corresponds to a bag of French words.  $P(E)$ and
$P(F|E)$ can be trained independently.

In order to model $P(F|E)$ we need a sentence-aligned parallel
(bilingual) corpus and a model of translation $\sim P(F|E)$.
The latter assumes word alignments which we do not have. We can
approximate those by using EM, an unsupervised learning algorithm that
can learn alignment probabilities.

{\bf 30\% for explaining the noisy channel model, 10\%
    for writing down formulas that express how French gets converted
    into English, 10\% for explaining how probabilistic models of
    machine translation are trained (using aligned corpora and EM).}

\end{answer}
\end{qupart} 

\begin{qupart}
To what extent is this approach purely statistical and to what extent
does it rely on linguistic intuitions?  Justify your answers.
\mypercent{25}

\begin{answer}
The approach as described above does not rely on linguistic
knowledge about the two languages.
  
{\bf 10\% for discussing why the statistical MT
    approach is purely data driven, 15\% for explaining how parallel
    corpora are used to provide information about correspondences
    between two languages.}
\end{answer}
\end{qupart} 

\begin{qupart}
Do you believe that the approach
is limited to closely related language pairs such as English and
French, or will it be applicable to any language pair for which a
large enough corpus of text is available? Justify your  answers. 

\begin{answer}
The approach is limited to closely related language pairs. It
  will be difficult to translate languages that are structurally very
  different (e.g.,~languages that have different notions of what
  counts as a word or languages that have radically different word
  orders).
  
{\bf 15\% for explaining why the statistical approach to MT assumes that
  the two languages are structurally similar. 10\% for coming up with
  examples of language pairs for which the approach would be
  problematic.}
\end{answer}
\end{qupart} 

\begin{qupart}
What is the noisy channel model? Give an example of a text
processing application that is based on the noisy channel model. As
part of your answer draw a picture of the noisy channel model. 
\mypercent{20}

\begin{answer}
Using information theory, Shannon modeled the goal of
  communicating down a telephone line -- or in general across any
  channel -- in the following way. Imagine communication along a
  low-quality telephone line where the sender creates a message which
  is being transmitted over a channel with limited capacity; the
  receiver has to guess what the original message was. It is assumed
  that the output of the channel depends probabilistically on the
  input. The goal is to encode the message in such a way that it
  occupies minimal space while still containing enough redundancy to
  be able to detect and correct errors. On receipt, the message is
  decoded to give what was most likely the original message.

\[ 
 \stackrel{W}{\longrightarrow}\fbox{Encoder}\stackrel{X}{\longrightarrow}
 \fbox{\parbox{1in}{\centering Noisy Channel \\ $p(y|x)$}}
    \stackrel{Y}{\longrightarrow}\fbox{Decoder}
    \stackrel{\hat{W}}{\longrightarrow}  
\]

A simplified version of the noisy channel model has been used to model
the machine translation process. As an example, suppose we want to
translate a text from English to French. The channel model for
translation assumes that the true text is French, but that
unfortunately when it was transmitted to us, it went through a noisy
communication channel and came out as English. All we need to do in
order to translate is to recover the original French, i.e., to decode
the English to get the French.
\end{answer}
\end{qupart} 

\begin{qupart}
When using the noisy channel model for machine translation, we
  estimate $\argmax\limits_E P(E | F)$ by rewriting using Bayes rule
  to produce $\argmax\limits_E P(E) P(F | E)$. Why do we do this? What
  would happen if we simply directly modeled $P(E | F)$ and did
  without the language model $P(E)$? Give one or more reasons why the
  rewritten model should produce better English
  translations. \mypercent{25} 

\begin{answer}
Machine translation can be cast in terms of the noisy-channel
  model. The approach was pioneered by IBM in the early 90's. If we
  want to translate French into English, we shall assume that someone
  has $E$ in his head but by the time he writes it down it's
  {\it corrupted} and becomes $F$. To recover the original $E$, we
  need to reason about: (a)~the kinds of things people say in English
  and (b)~how English gets turned into French. Given a French sentence
  $F$, the approach searches for English $E$ that maximises $P(E|F)$.

\[
\begin{array}{lll}
\argmax\limits_{E}P(E|F) &  = & \argmax\limits_{E} \frac{P(E) \cdot P(F|E)}{P(F)} \\
                   
                  & =  &  \argmax\limits_{E} P(E) \cdot P(F|E)  \\
\end{array}
\]

We choose not to model $P(E|F)$ directly. Instead, we break things
apart to get good translations even if the probability numbers are not
that accurate. $P(F|E)$ will ensure that a good $E$ will have words
that generally translate to words in $F$. $P(E)$ will help save us
from bad English and $P(F|E)$ only needs to say whether a bag of
English words corresponds to a bag of French words.  $P(E)$ and
$P(F|E)$ can be trained independently.
\end{answer}
\end{qupart} 

\begin{qupart}
  Consider a simple translation model in which we simply map words
  from one language to another, and allow words in any position to be
  equally likely to appear in any position in the translation. In
  effect, the translation model makes every ordering of the translated
  words equally likely. For example, the French sentence {\it Je
    t'aime} would produce equally likely sequences {\it I you love},
  {\it I love you}, {\it you I love}, {\it you love I}, {\it love
    I you}, and {\it love you I}. Could we still produce reasonable
  translations using this translation model? Why? In what cases would
  it tend to fail? Give an example.
\mypercent{25}

\begin{answer}
Such a model would produce several ungrammatical translations.
  In fact, there would be a large number of English translations
  corresponding to a single French sentence. The model would need to
  be complemented with a mechanism that picks one translation from the
  set of possible translations. Since every ordering is equally
  likely, the selection process cannot be done probabilistically. One
  solution would be to select a random order. This of course entails
  that we are not guaranteed to have a good or even grammatical
  translation. The model will fail to capture the fact that there is
  no one-to-one mapping from one language to another. Several words in
  the source language may be rendered by a single word in the target
  language. Furthermore, even in cases where there is a direct
  mapping, the order of words may differ from one language to the
  other. The naive model fails on both accounts. 
\end{answer}
\end{qupart} 

\end{question}

\newpage

\subsection*{Text Summarisation}

\begin{question}

\begin{qupart}
One of the most popular applications of text processing is
automatic summarisation. Briefly describe why automatic
summarisation is a challenging task. What information should an
automatic summariser take into account so that it produces good
summaries? What are the limitations of methods based solely on
sentence extraction? 
\mypercent{30}

\begin{answer}
Summarization -- the art of abstracting key content from one or
  more information sources -- has become an integral part of everyday
  life. People keep abreast of world affairs by listening to news
  bites. They base investment decisions on stock market updates. They
  even go to movies largely on the basis of reviews they've seen. With
  summaries, they can make effective decisions in less time. Although
  some summarizing tools are already available, with the increasing
  volume of online information, it is becoming harder to generate
  meaningful and timely summaries. Tools such as Microsoft's
  AutoSummarize option in Office 97 are useful, but their application
  is limited to \emph{extraction}: selecting original pieces from the
  source document and concatenating them to yield a shorter text.
  \emph{Abstraction}, in contrast, paraphrases in more general terms
  what the text is about.
  
  The concatenation approach to extraction does little to ensure that
  the summary is coherent, which can make the text hard to read.
  Moreover, the source may not always have text (for example, a sports
  event on videotape or tables displaying economic data), and current
  tools cannot summarize non-textual media.  Finally, these tools do
  not currently handle multiple sources. For example, there may be
  many stories on the Web about a particular news event, and it would
  be useful if the summarizer could capture common and new
  information.  To address these limitations, researchers are looking
  at a variety of approaches, which roughly fall into two categories.
  Knowledge-poor approaches rely on not having to add new rules for
  each new application domain or language.  Knowledge-rich approaches
  assume that if you grasp the meaning of the text, you can reduce it
  more effectively, thus yielding a better summary. They rely on a
  sizeable knowledge base of rules, which must be acquired,
  maintained, and then adapted to new applications and languages. The
  two classes of methods are not necessarily mutually exclusive. In
  fact, some approaches use a hybrid.
  
  In both methods, the main constraint is the compression requirement.
  Extracts of single documents usually aim to be five to 30 percent of
  the source length.  However, compression targets in summarizing
  multiple sources or in providing summaries for handheld devices are
  much smaller. These high reduction rates pose a challenge because
  they are hard to attain without a reasonable amount of background
  knowledge. 

  Another challenge is how to evaluate summarizers. If you are to
  trust that the summary is indeed a reliable substitute for the
  source, you must be confident that it does in fact reflect what is
  relevant in that source. Hence, methods for creating and evaluating
  summaries must complement each other.
  
  \vspace{.2em} \fbox{\parbox{15cm}{10\% for discussing the
      limitations of extractive summarisation, 10\% for discussing the
      output of summarisers and the fact that it is not particularly coherent
      or informative, and 10\% for mentioning problems with respect
      to evaluation.}} \vspace{.2em}
\end{answer}
\end{qupart} 

\begin{qupart}
Describe Kupiec et al.'s (1995) approach to document
summarisation. Your answer should discuss: (a) what their training
corpus consists of; some of the features used to detect important
information in the texts; and how their system determines whether a
sentence should be included in the summary or not.
\mypercent{20}

\begin{answer}
The system presented by Kupiec et al (1995) is a modified Naive
  Bayes classifier. Their training corpus consists of document-summary
  pairs from a number of scientific journals. Before training, they
  run a sentence matching algorithm to determine which sentences in the
  document correspond to sentences in the summary. Instead of doing
  sub-sentence information extraction, the system concentrates on
  sentence selection, so sentences in the training data are marked as
  being part of the summary or not. The system examines a
  set of features for each sentence, all of which are discrete: 

  \paragraph{Sentence Length Cut-off Feature:} Short sentences tend to not be
  included in summaries. Sentences longer than a certain threshold
  have a value of true, and the others are false.

  \paragraph{Fixed-Phrase Feature:} Sentences containing or following
  a number of key phrases tend to be included in summaries. Kupiec et
  al. use 26 indicator phrases. Sentences containing those phrases, or
  ones following section heads with key words, have a value of
  true. All other sentences have a value of false.

  \paragraph{Paragraph Feature:} Sentences are categorized as being
  the initial sentence in a paragraph, a medial sentence, or a final
  sentence.

  \paragraph{Thematic Word Feature:} The most frequent content words
  are designated theme words. Sentences are ranked based on the
  frequency of theme words. This feature is binary, depending on
  whether a sentence is among the highest ranked sentences.

  \paragraph{Uppercase Word Feature:} Proper names and acronyms tend
  to be important. This feature is similar to the previous one, with
  sentences ranked according to the frequency of uppercase words that
  appear more than once in the document. 
  
  For each sentence $s$, the system determines the probability that it
  will be included in a summary $S$, given features $F_{1},  \ldots,F_{k}$ using
  a Bayes decision rule: 

\[
P(s \in S|F_1, F_2, \ldots, F_k) = P(F_1, F_2, \ldots, F_k|s \in S) \times
\frac{P(s \in S)}{P(F_1, F_2, \ldots, F_k)}
\]

 Assuming that the features are
  statistically independent, the above becomes: 
   
\[
\frac{P(s \in S)\prod_{j=1}^{k}{P(F_{j}| s \in
    S)}}{\prod_{j=1}^{k}{P(F_{j})}}
\]

Thus, the system is essentially a naive Bayes classifier.  $P(s \in
S)$ is constant, and $P(F_j |s \in S)$ and $P(F_j)$ are estimated from
the training set by counting occurrences.

\vspace{.2em} \fbox{\parbox{15cm}{5\% for giving the details of the
    corpus, 10\% for mentioning some of the features Kupiec et al. are
    using (full credit will be given if 3 features are mentioned), and
    5\% for describing the summary model (full credit will be given if
    the Naive Bayes classifier is described informally).}}
\end{answer}
\end{qupart} 

\begin{qupart}
Write some perl code \ldots
\mypercent{20+30}
\end{qupart} 

\end{question}

\answersonly{\newpage}

\subsection*{Word Sense Disambiguation}

\begin{question}

\begin{qupart}
Explain what is meant by the ``bag of words'' model which is used for
  various text processing tasks.
\mypercent{10}

\begin{answer}
The bag of words model assumes that text can be represented as a
simple list of the words it contains and, possibly, their frequencies
in the text. Information about the relations between these words and
their position in the text is ignored.
\end{answer}
\end{qupart}

\begin{qupart}
Explain how the bag of words model can be used in Information
Retrieval and word sense disambiguation.
\mypercent{15}

\begin{answer}
The standard approach to IR models each document as a bag of words,
and identifies relevant documents by comparing queries to documents
using similarity measures that work over such bag-of-words
representations. 

The bag of words model has various applications in word sense
disambiguation, including naive Bayesian classifiers, which treat the
context of an ambiguous word as a bag of words, and the Lesk algorithm
for disambiguation using dictionary definition overlap, which treats
each definition as a bag of words.
\end{answer}
\end{qupart} 

\begin{qupart}
What are the similarities between part of speech tagging and word
  sense disambiguation? Why are techniques for part of speech tagging
  not suitable for word sense disambiguation?  \mypercent{15}

\begin{answer}
Both part of speech tagging and WSD aim to add extra information to
each token in a text (or possibly just each content word). Part of
speech tagging adds syntactic information while WSD selects amongst
word meanings.

Part of speech tagging approaches generally examine a narrow context
around the word being annotated, for example the previous one or two
words. WSD requires a wider context.
\end{answer}
\end{qupart}

\begin{qupart}
Describe Lesk's algorithm for word sense disambiguation using
dictionary definition overlap and explain its
limitations. \mypercent{25}
\end{qupart}

\begin{answer}
Lesk's algorithm relies on the textual definitions of words in a
dictionary to disambiguate words. The pairwise combinations of senses
are considered and the number of words they share in common
counted. The pair of senses with the largest number of words in common
are chosen as the senses. 

To work well the algorithm requires the definitions to be pre-processed
in a number of ways. Empty heads and stop words are removed. (Empty
heads are short expressions generally found at the start of dictionary
definitions which indicate the hypernym of the definition, such as
``a type of''.) The remaining words are stemmed. 
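
The core overlap computation is straightforward; a minimal Perl sketch
(assuming each definition has already been pre-processed into a
whitespace-separated string of stems) might be:

\begin{verbatim}
sub overlap {                  # words shared by two definitions
    my ($def1, $def2) = @_;
    my %in1 = map { $_ => 1 } split ' ', $def1;
    my $shared = 0;
    foreach my $w (split ' ', $def2) {
        $shared++ if $in1{$w};
    }
    return $shared;
}
\end{verbatim}

The algorithm scores each pairwise combination of candidate senses with
such a function and selects the highest-scoring pair.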

The limitations are: 

\begin{enumerate}
\item The approach depends on the correct senses sharing words in their
   definitions. This means that the approach depends on the particular
   dictionary used and will prefer senses with longer definitions over
   shorter ones. 
\item The approach may not be tractable for long sentences; the number
   of sense combinations which have to be checked could be prohibitive. 
\end{enumerate}
\end{answer}

\begin{qupart}
Describe two techniques which can be used to annotate text with word
sense information automatically without the need for manual
annotation. These techniques are used to provide data which can be
used to train and/or test a word sense disambiguation system. Describe
the disadvantages of each.\mypercent{15}
\end{qupart} 

\begin{answer}
Two techniques are:
\begin{enumerate}
\item Make use of parallel corpora where the sense is defined as the
   translation of a word into another language. 

\item Use pseudowords to automatically introduce ambiguity into
   text. (Pseudowords are created by choosing two or more words,
   e.g. car and bottle, and replacing each occurrence of each with
   their concatenation, car-bottle. The task of the disambiguation
   algorithm is to identify the original word; a construction sketch
   follows this list.)
\end{enumerate}
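
A minimal Perl sketch of the pseudoword construction from item 2
(assuming the chosen words are {\it car\/} and {\it bottle\/}):

\begin{verbatim}
while (<>) {
    s/\b(car|bottle)\b/car-bottle/gi;  # conflate both words
    print;                             # the disambiguator must
}                                      # recover the original
\end{verbatim}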

Their disadvantages are:
\begin{enumerate}
\item Parallel text may be hard to obtain. 

\item Sense distinctions in pseudowords may not be appropriate for actual
   applications (e.g. translation). The sense distinctions are
   artificial and so an algorithm may learn to disambiguate between
   contexts in which each of the words is likely to occur rather than
   meanings of the words. 
\end{enumerate}
\end{answer}

\begin{qupart}
What assumption is made by a naive Bayes classifier when it is used for word
sense disambiguation?
\mypercent{10}
\end{qupart}

\begin{answer}
The naive Bayes classifier assumes that the probabilities of the words
in the context of the ambiguous word are conditionally
independent. That is, the presence, or otherwise, of a word in the
context has no influence on other words. 
\end{answer}

\end{question}

\answersonly{\newpage}

\subsection*{Information Retrieval}

\begin{question}
\end{question}

\answersonly{\newpage}

\questionsend

\end{exam}

\end{document}

\begin{qupart}
\mypercent{XX}

\begin{answer}
\end{answer}
\end{qupart}