% examsol02-03.tex -- Solutions for COM3110 (Text Processing) Exam 2002-03
\documentclass[12pt]{article}
%\documentstyle[12pt,newexam,epsf]{article}
%\documentstyle[myexam]{article}
\usepackage{graphicx}
\usepackage{newexam+shield}
\usepackage{epsf}
\input{abbrev}
%\input{matdec}




%%%% local macros %%%%

% \seq{x}{n}: typesets the sequence x_1, ..., x_n
\newcommand{\seq}[2]{\mbox{$#1_{1},\- \ldots ,\-#1_{#2}$}}
% Propositions numbered within the current question
\newtheorem{prop}{Proposition}[questionnumber]
% proof environment: bold "Proof" label, closed with a small filled rule (QED mark)
\newenvironment{proof}{\begin{trivlist}\item[\hskip \labelsep {\bf Proof}]}{\nopagebreak
    \rule{1mm}{3mm}\end{trivlist}}
% answer environment: model answers set in italics, separated by vertical space
\newenvironment{answer}{\bigskip\it}{
    \rm\bigskip}

% \es: the empty string (lambda)
\newcommand{\es}{\mbox{$\lambda$}}
% \mrewrites: "rewrites in zero or more steps" (starred double arrow)
\newcommand{\mrewrites}{\mbox{$\stackrel{*}{\Rightarrow}$}}
% \psrule: rewrite arrow used in phrase-structure rules
\newcommand{\psrule}{\mbox{$\rightarrow$}}
%\newcommand{\psrule}[2]{\mbox{{\rm #1}\ \ $\rightarrow$\ \ {\rm #2}}}
% \synsemrule{LHS}{RHS}{sem}: phrase-structure rule annotated with a semantics
\newcommand{\synsemrule}[3]{%
    \mbox{{\rm #1}\ \ $\rightarrow$\ \ {\rm #2}\ :\ ${\it #3}$}}

% \scbrl / \scbrr: semantic (denotation) double brackets [[ and ]]
\newcommand{\scbrl}{\left[\hspace*{-0.12em}\left[}
\newcommand{\scbrr}{\right]\hspace*{-0.12em}\right]}
% argmax/argmin operators (upright, with limits via \mathop)
\def\argmax{\mathop{\rm argmax}}
\def\argmin{\mathop{\rm argmin}}
% \pgm: program names in small caps
\newcommand{\pgm}[1]{{\sc #1}}
% Set-theoretic and logical shorthands
\newcommand{\union}{\cup}
\newcommand{\intersection}{\cap}
\newcommand{\ch}{$\surd$}           % check mark (surd symbol)
\newcommand{\entails}{\vdash}
\newcommand{\tand}{\wedge}
\newcommand{\tor}{\vee}
%%%%%%%%%%%%%%%%%%%%%%

\begin{document}

\bc         
{\bf  Solutions for COM3110 (Text Processing) Exam 2002-03\\
        Setters: R. Gaizauskas (1,2)/L. Guthrie (3,4)}
            
\ec         

{\bf Note: These solutions are more than model solutions; they are
  {\it ideal} solutions in the sense that they contain more, in total,
  than any student could reasonably be expected to produce. Full
  marks for most questions could be obtained by producing less than is
  included here.  Their utility is as an aid in marking, as they
  supply the union of all material the students might be expected to
  present as an answer.}

\begin{question}
Answer {\bf each} of the following short answer questions. 


% Character encoding, compression and text markup
 
 % Character Encoding + Unicode
 
 \begin{qupart}
      Explain each of the following terms and make clear
      the relations between them: {\bf language}, {\bf script},
      {\bf character}, {\bf glyph}, {\bf font}. Give examples of each.
      \mypercent{20}
      
      %\exitem Explain the overall goals and high level design principles
      %underlying Unicode.
      %What distinguishes Unicode from earlier character coding schemes? 
      %\mypercent{20}
  \begin{answer}
      \begin{description}
      \item[language] the fundamental means of communication between
        humans; may be spoken or written -- spoken language both
        historically and cognitively more fundamental than written
        language (e.g. English, Chinese, Hindi)
      \item[script] a system for writing down a language -- ``the system
        of graphical marks employed to record expressions of a
        language in visible form''; some languages have more than one
        script (e.g Japanese); some scripts serve for more than one
        language (e.g. Roman)
      \item[character] the smallest unit of written language that has
        a semantic value (e.g. ``a'' in English; a Chinese
        ``logogram''; a consonant + vowel in an alphasyllabic script
        like Devanagari);
      \item[glyph] a representation of a character or characters, as
        it/they is/are rendered or displayed

        A given character can be
        rendered in many ways (italic, sanserif) -- i.e. can have many
        corresponding glyphs); and multiple characters can be
        represented as a single glyph (e.g. ligatures; characters +
        diacritics in Devanagari)
      \item[font] a repertoire of glyphs  (e.g. 10 pt times roman)
      \end{description}
      \rm

      {\bf 20 \%: 4 \% per term}
      
\end{answer}    
 \end{qupart}

 \begin{qupart}
   Briefly describe the function of, and give an example of, each of the
   \verb+<!ELEMENT>+, \verb+<!ATTLIST>+ and \verb+<!ENTITY>+
   declarations as found in SGML document type definitions.
   \mypercent{20}
 \end{qupart}

\begin{answer}
  
  For a document with DTD $D$ 
  \begin{enumerate}
  \item  \verb+<!ELEMENT>+ declarations specify
  the type and structure of the textual units or elements that will be
  found in an SGML document of type $D$.  They specify
    \begin{itemize}
    \item the name of the element
    %\item whether starting and ending tags are compulsory (-) or not
    %  (0)
    \item the content model of the element -- the type and order of
      sub-elements that may or must occur within this element.
      Child elements are either:
        other named SGML elements,
        {\tt  \#PCDATA} (parsed character data),
        {\tt  \#NDATA} (binary data),
        {\tt EMPTY}.
      %\item the sequence of elements is specified by writing their
      %  type specifiers in a comma-separated sequence; alternatives
      %  may be specified using {\tt |}
      %\item regular expression quantifiers are used to specify number
      %  of occurrences -- no quantifier means exactly one, {\tt
      %    ?} means 0 or 1, {\tt \*} means 0 or more, and {\tt +}
      %  one or more.

    \end{itemize}
    

    E.g. An email element might be defined as:\\
    \verb#<!ELEMENT email - - (header, contents) >#\\
    which asserts that an email element is called {\tt email}, has
    compulsory opening and closing tags, and consists of a header
    element followed by a contents element (each of which may in turn
    be composed of sub-elements).
      %sender element, one or more
      %address elements, 0 or 1 subject elements and 0 or more Cc elements.

 \item  \verb+<!ATTLIST>+ declarations specify
  \begin{itemize}
  \item the attributes associated with each element
  \item for each attribute:
    the type of the attribute 
    a default declaration for the attribute -- indicates whether
      a value is optional ({\tt \#IMPLIED}), required ({\tt
        \#REQUIRED}), or if there is an explicit default
  \end{itemize}

  For example, the email element might have as associated attributes
  a required {\tt date\_sent} attribute and a {\tt status} attribute
  with default value {\tt public}:
  \begin{verbatim}
   <!ATTLIST e-mail
          date_sent    DATE               #REQUIRED
          status       (secret | public)  public >
  \end{verbatim}

  \item \verb+<!ENTITY>+ declarations provide a general ``macro''
    definition facility which may be used in both DTD and document
    instances
    \begin{itemize}
    \item SGML has a number of built-in entities. For example:
       \verb+&lt;+  escapes \verb+<+ when \verb+<+
        is to occur in character data (and is not signalling 
        the beginning of an SGML tag); \verb+&amp;+  escapes \verb+&+.
    \item Entities may also be used to define substitute strings
      as abbreviations. E.g:
\begin{verbatim}
<!ENTITY fname SYSTEM "/home/robertg/docs/foo.dtd">
\end{verbatim}
     allows \verb+&fname;+ to be used in the document instance to
     refer to the external file named by the path
    \end{itemize}
  \end{enumerate}

{\bf 8 \% for Elements, and 7 \% each for Attributes and Entities. For
each 5\% for the description and 2/3\% for the example.}
\end{answer}

 \begin{qupart}
   Briefly describe the Unicode coding model, making clear the levels in the
   model, explaining the differences between, and the motivations for,
   UTF-8 and UTF-16
   %, and describing the purpose and implementation of
   %surrogate pairs.  
 \mypercent{20}

\begin{answer}

       The Unicode model may be first approximated by 
  a three level model consisting of:
  \begin{enumerate}
  \item an abstract character repertoire %(ACR)
  \item their mapping to a set of integers %(coded character set -- CCS)
  \item their encoding forms + byte serialization
  \end{enumerate}


  Expanding:
  \begin{enumerate}
  \item Abstract character repertoires are being established for all the
  world's languages -- involves specifying what the set of characters
  in each language are.

  \item Each of these character repertoires is mapped onto integers in the
  range 0 -- 2$^{16}$$-$1. Of these 65,536 values 
    \begin{itemize}
    \item 63,488 are available to represent characters with single
      16-bit code values
    \item 2,048 code values are available to represent an additional
      1,048,576 characters through paired 16-bit code values
      (called {\bf surrogate pairs})
    \end{itemize}
    
  \item Once a mapping from an abstract character set to a set of
    integers has been defined two further mappings need to be
    specified
  \begin{itemize}
  \item {\bf Character Encoding Form} -- a mapping from a set of
    integers to a set of sequences of code units of specified width
    (e.g. 8-bit bytes).
  \item {\bf Character Encoding Scheme} -- a mapping from a set of
    sequences of code units to a {\bf serialized} sequence of bytes
    (addresses issues of byte ended-ness on different machines)
  \end{itemize}

    \noindent
    The most common character encoding forms are:

    \begin{itemize}

    \item UTF-16 (Unicode (or UCS) Transformation Format-16)
    -- default Unicode encoding form -- characters are assigned to two
      eight bit bytes, except for surrogate pairs which consist of two
      16-bit values (4 bytes)
    
      advantages:
      \begin{itemize}
      \item the most simple and straightforward way to map Unicode
        code points into code units (bytes)
      \end{itemize}
     
      disadvantages:
      \begin{itemize}
      \item files containing only Latin texts are twice as large as 
        they are in single byte encodings, such as ASCII or ISO
        Latin-1
      \item not backwards/forwards compatible with ASCII -- so 
        programs that expect a single-byte character set won't
        work on a UTF-16 file {\it even if it only contains Latin text}.
      \end{itemize}


    \item UTF-8 (Unicode (or UCS) Transformation Format-8) --
      maps a Unicode scalar value onto 1 to 4 bytes 
      \begin{itemize}

      \item 1st byte indicates number of bytes to follow
      \item one byte sufficient for ASCII code values (0..127)
      \item two bytes sufficient for most non-ideographic scripts 
      \item four bytes needed only for surrogate pairs
      \end{itemize}
    
      advantages:
      \begin{itemize}
      \item existing ASCII files are already UTF-8
      \item most broadly supported encoding form today
      \end{itemize}

    disadvantages:
      \begin{itemize}
      \item ideographic (mostly Asian) languages require 3
        bytes/character -- so UTF-8 encodings are larger for
        Asian languages than UTF-16 and most existing encodings 
      \end{itemize}

      \end{itemize}
%    \noindent
%    The most common character encoding schemes are:
%    \begin{itemize}
%
%      \item UTF-16BE -- UTF-16 with big endian byte sequencing
%      \item UTF-16LE -- UTF-16 with little endian byte sequencing
%
%    \end{itemize}



  \end{enumerate}
\bigskip

      {\bf 20\%:  10\% for a clear picture of the levels of the model;
        10 \% for UTF-8/16 distinction}

\end{answer}




 \end{qupart}

 \begin{qupart}
   Define each of the following terms as they are used in discussing
   Perl regular expressions and give an example of each:
   {\bf metacharacters}, {\bf metasymbols}, {\bf anchors}, {\bf
   quantifiers}, {\bf back references}.
 \mypercent{20}
 \end{qupart}

 \begin{answer}
   {\bf Metacharacters} are single characters which do not match themselves
   in regular expressions, but instead have special meanings. There
   are 12 of them:
\begin{verbatim}
    \ | ( ) [ { ^ $ * + ? .
\end{verbatim}
        
   {\bf Metasymbols} are sequences of two or more characters with special
   meaning in regexs, the first character of which is always a
   backslash.
  
   Most metasymbols fall into two categories
  \begin{itemize}
  \item {\it special characters}: certain characters in a regex that
    cannot easily be typed at the keyboard -- e.g. \verb+\n+ --
    Newline
  \item {\it character class shortcuts}: certain character classes are
    used so frequently abbreviations have been created for them --
    e.g.  \verb+\d+ (digit character) abbreviates \verb+[0-9]+
  \end{itemize}

   {\bf Anchors} 

   The default behaviour of the Perl RE matcher is to attempt to
  match the pattern against the string, starting at the beginning
  of the string, then ``floating'' down it to find a match.
  To force the pattern to match at particular points in the
  string {\it anchors} can be included in the regex.
  \begin{itemize}
  \item The beginning of a string is matched using the \verb+^+ 
  character. E.g. 
  \verb+/bert/+ matches {\tt bert} and {\tt robert};
  \verb+/^bert/+ matches {\tt bert}  but not {\tt robert}.
  \item The end of a string is matched using the \verb+$+ 
    character.
  \item \verb+\b+ (word-boundary anchor) anchors matching at the
  beginning or end of ``words'' (strings matched by \verb?\w+?).
    \verb+\B+  matches non-word boundaries -- everywhere  \verb+\b+
    does not.
  \end{itemize}

   {\bf Quantifiers} 

   Quantifiers specify how many times a specific character pattern  in
   an RE is to match the input. There are three basic
  quantifiers in Perl regexs:
  \begin{itemize}
  \item \verb+*+ : match the preceding item 0 or more times
  \item \verb?+? : match the preceding item 1 or more times
  \item \verb+?+ : match the preceding item 0 or 1 times
  \end{itemize}
  Precise numbers of matches can be specified using numeric
  quantifiers in curly braces.

  Items to which quantifiers attach include characters, character
  classes and groupings of characters in parentheses (``()'')

  Examples
  \begin{itemize}
  \item \verb?/:\w+:/? matches one or more ``word'' characters between
    {\tt :}'s
%   \item \verb?/foo+/? matches {\tt foo}, {\tt fooo}, {\tt foooo}, etc.
   \item \verb?/(foo)+/? matches {\tt foo}, {\tt foofoo}, {\tt foofoofoo}, etc.
   \item  \verb?/\d{5,10}/? matches from 5 to 10 digits
  \end{itemize}

   {\bf Back References} 
   
   {\it Capturing} provides the capability to ``remember'' a substring
   matched by part of a pattern, and use that substring later on in
   the pattern itself via a back reference.  Capturing is done using
   parentheses (``()'') -- any substring of the target matched by a
   pattern segment in parentheses is remembered by the matcher
    Back referencing is done using a backslash followed by an
    integer identifying which captured string is referred to --
    the first captured substring is back referenced as \verb+\1+,
    the second by \verb+\2+, etc.

    For example \verb+/\B([a-z])\1(ing|ed)\b/+ would find all words ending 
  in a double-letter followed by {\tt ing} or {\tt ed}.

  {\bf 20\%: 4\% each  -- 3\% for definition, 1\% for example}
      \end{answer}

 \begin{qupart}
   What would be an appropriate Perl data structure to hold an email
   folder, where an email folder is viewed as a numbered sequence of
   messages each of which consists of a collection of fields, such as
   \verb+To+, \verb+From+, \verb+Subject+, etc.? 
%    %Give the Perl code to
%    %access a message field, given the message number, the message field
%    %name and the data structure holding the email folder. 
%    Give the Perl
%    %code to sort and print the folder alphabetically by \verb+From+
%    code to print one line for each email message in the folder
%    containing the \verb+From+ and \verb+Subject+ fields for the
%    message. The whole list should be sorted alphabetically by
%    \verb+From+ field, and within  \verb+From+ field by message number.

   Write a Perl subroutine which, given a sender's name and a reference
   to the email folder data structure, will print out the subject field
   of all messages from the sender, one per line.
   \mypercent{20}

\begin{answer}
  
  An appropriate data structure would be an array of hash references,
  where each entry in the array points to an email message hash, and each hash has
  as keys the field names and as values the contents of the field.
  
  %The following Perl code sorts and prints the folder alphabetically.
  %Assume the folder is held in an array of hash references called
  %\verb+@email_folder+

%   %senders = (); 
%   $index = 1;
%   for ($email in @email_folder) {
%     $senders{$index} = $$email{``From''};
%     $$index++;
%   }
%   for ($i=0;$i <= $#@senders;$i++) {    
%   }

\begin{verbatim}
sub print_sender_messages {
  # Given a sender's name and a reference to the email folder (an
  # array of hash references, one hash per message, keyed by field
  # name), print the Subject field of each of that sender's
  # messages, one per line.
  my ($sender, $folder_ref) = @_;
  my ($subject);
  foreach my $email_ref (@$folder_ref) {
      if ($$email_ref{"From"} eq $sender) {
        $subject = $$email_ref{"Subject"};
        print "$subject\n";
      }
  }
}
\end{verbatim}
{\bf 5 \% for the data structure; 15 \% for the code: 
  5 \% for the subroutine, parameter passing; 
  5 \% for using references correctly; 5 \% for appropriate
  looping constructs}
\end{answer}

 \end{qupart}


\end{question}

%\newpage

\begin{question}
 % Text Compression
 \begin{qupart}
   Text compression techniques are important because growth in volume
   of text continually threatens to outstrip increases in storage,
   bandwidth and processing capacity. Briefly explain the differences between:
    \begin{exlist}
      \exitem {\bf symbolwise} (or
      statistical) and {\bf dictionary} text compression methods;
      \mypercent{10} 

\begin{answer}
  \begin{itemize}
  \item {\bf Symbolwise methods} work by estimating the probabilities
    of symbols (characters or words/non-words) and coding one symbol 
    at a time using shorter codewords for the more likely symbols

  \item {\bf Dictionary methods} work by replacing word/text fragments 
    with an index to an entry in a dictionary
  \end{itemize}

  {\bf 5\% for explanation of symbolwise methods, 5 \% for dictionary}
\end{answer}


      \exitem {\bf modelling} versus {\bf coding} steps 
      \mypercent{10}

\begin{answer}
 Symbolwise methods rely on a modeling step and a coding step
  \begin{itemize}
  \item {\bf Modeling} is the estimation of probabilities for the 
    symbols in the text -- the better the probability estimates, the
    higher the compression that can be achieved

\medskip
  \item {\bf Coding} is the conversion of the probabilities obtained 
    from a model into a bitstream 
  \end{itemize}
  {\bf 5\% for explanation of modelling step, 5 \% for coding step}
\end{answer}

      \exitem  {\bf static}, {\bf semi-static} and {\bf adaptive}
      techniques for text compression;
      \mypercent{10}

\begin{answer}
Compression techniques can also be distinguished by 
  whether they are 
  \begin{itemize}
  \item {\bf Static} -- use a fixed model or fixed dictionary derived
    in advance of any text to be compressed
\medskip
  \item  {\bf Semi-static} -- use current text to build a 
    model or dictionary during one pass, then apply it in second pass
\medskip
  \item {\bf Adaptive} -- build model or dictionary adaptively during
    one pass
  \end{itemize}
{\bf 3\% each, 1 \% discretionary}
\end{answer}


\newpage



%      \exitem {\bf Huffman coding} and {\bf arithmetic coding} methods
%      for text compression.
%      \mypercent{10} 
   \end{exlist}
 \end{qupart}


\begin{qupart}
  The script for the fictitious language Gavagese contains only the
  7 characters {\it a}, {\it e}, {\it u}, {\it k}, {\it r}, {\it f}, {\it
    d}. You assemble a large electronic corpus of Gavagese and now
  want to compress it. You analyse the frequency of occurrence of each
  of these characters in the corpus and, using these frequencies as
  estimates of the probability of occurrence of the characters in the
  language as a whole, produce the following table:

\bigskip
\begin{center}
  \begin{tabular}{ll}
    Symbol & Probability \\ \hline
    a & 0.25\\
    e & 0.20 \\
    u & 0.30 \\
    k & 0.05 \\
    r & 0.07\\
    f & 0.08\\
    d & 0.05 \\
\end{tabular}
\end{center}

\begin{exlist}
\exitem Show how to construct a Huffman code tree for Gavagese,
given the above probabilities.
\mypercent{30}

\begin{answer}
  Start off by creating a leaf node for each character, with
  associated probability (a). Then join two nodes with smallest
  probabilities under a single parent node, whose probability is their
  sum, and repeat till only one node left. Finally, 0's and 1's are
  assigned to each binary split.

\begin{figure*}[!tbh]
\centerline{\includegraphics[width=12cm]{./examsol02-03-fig1.eps}}
\end{figure*}

{\bf 10 \% for the explanation of the method; 20 \% for a correct tree}
\end{answer}

%\newpage

\exitem Use your codetree to encode the string {\it dukerafua} and
show the resulting binary encoding.  For this string, how much
length does your codetree encoding save over a minimal fixed length
binary character encoding for a 7 character alphabet? 
\mypercent{10}

\begin{answer}
Encoding for {\it dukerafua} will be 

\begin{verbatim}
d    u  k    e  r    a  f    u  a 
1101 10 1100 01 1111 00 1110 10 00
\end{verbatim}

For a seven letter alphabet a minimal fixed length binary character
encoding is 3 bits per character. There are 9 characters in the
string, so a fixed length encoding would require 27 bits. The
codetree encoding is 26 bits, so only one bit is saved (the
advantages will become apparent over larger more statistically
representative strings).

{\bf 5 \% for the encoding; 5 \% for getting the amount saved correct}
\end{answer}

\end{exlist}

\end{qupart}

\begin{qupart}
  One popular compression technique is the LZ77 method, used in common
  compression utilities such as {\it gzip}.
\begin{exlist}
\exitem Explain the key idea underlying LZ77, issues that must be
addressed in implementing it, and how {\it gzip} has addressed some
of these issues.
\mypercent{20}

\begin{answer}
  The {\bf key idea} underlying the LZ77 adaptive dictionary compression method
  is to replace substrings with a pointer
  to previous occurrences of the same substrings in same text.  The
  encoder output is a series of triples where
  \begin{itemize}
  \item the first component indicates how far back in decoded output to
    look for next phrase
  \item the second indicates the length of that phrase
  \item the third is next character from input (only necessary when 
    not found in previous text, but included for simplicity)
  \end{itemize}
  
  {\bf Issues} to be addressed in implementing an adaptive dictionary method
  such as LZ77 include
  \begin{itemize}
  \item how far back in the text to allow pointers to refer
    \begin{itemize}
    \item references further back increase chance of longer matching
      strings, but also increase bits required to store
      pointer
    \item typical value is a few thousand characters
    \end{itemize}
  \item how large the strings referred to can be
    \begin{itemize}
    \item the larger the string, the larger the width parameter
      specifying it
    \item typical value $\sim$ 16 characters
    \end{itemize}
  \item during encoding, how to search window of prior text for
    longest match with the upcoming phrase
    \begin{itemize}
    \item linear search very inefficient
    \item best to index prior text with a suitable data structure,
      such as a trie, hash, or binary search tree
    \end{itemize}
  \end{itemize}
  A popular high performance implementation of LZ77 is
  {\bf gzip}:
  \begin{itemize}
  \item uses a hash table to locate previous occurrences of strings
    \begin{itemize}
    \item hash accessed by next 3 characters
    \item holds pointers to prior locations 
      of the 3 characters
    \end{itemize}
  \item pointers and phrase lengths are stored using variable length
    Huffman codes, computed semi-statically by processing 64K blocks
    of data at a time %(can be held in memory, so appears as if
    %single-pass)
  \item pointer triples are reduced to pairs, by eliminating
    3rd element
    \begin{itemize}
    \item  first transmit phrase length --
    if 1 treat pointer as raw character;
    else treat pointer as genuine pointer
    \end{itemize}
  \end{itemize}

{\bf 10 \% for the key idea; 5 \% for issues and 5 \% for gzip}

\end{answer}


\exitem How would the following LZ77 encoder output

% a b ad badb bbba

%badbadbbbbaaaaddddabba


    \[ 
    \langle 0, 0, b \rangle  \langle 0, 0, a \rangle  
    \langle 0, 0, d \rangle  
    \langle 3, 3, b \rangle   \langle 1, 3, a\rangle 
    \langle 1, 3, d \rangle   \langle 1, 3, a \rangle  
    \langle 11, 2, a \rangle  
    \]
 
be decoded, assuming the encoding representation presented in the
lectures?
\mypercent{10}

\begin{answer}

  \begin{enumerate}
  \item $\langle 0, 0, b \rangle$ Go back 0 copy for length 0 and end with
    $b$: $b$
  \item $\langle 0, 0, a \rangle$ Go back 0 copy for length 0 and end with
    $a$: $ba$
  \item $\langle 0, 0, d \rangle$ Go back 0 copy for  length 0 and end with
    $d$: $bad$
  \item $\langle 3, 3, b \rangle$  Go back 3 copy for  length 3 and end with
    $b$: $badbadb$
  \item $\langle 1, 3, a \rangle$  Go back 1 copy for  length 3 and end with
    $a$: $badbadbbbba$
  \item $\langle 1, 3, d \rangle$  Go back 1 copy for  length 3 and end with
    $d$: $badbadbbbbaaaad$
  \item $\langle 1, 3, a \rangle$  Go back 1 copy for  length 3 and end with
    $a$: $badbadbbbbaaaadddda$
  \item $\langle 11, 2, a \rangle$ Go back 11 copy for  length 2 and end with
    $a$: $badbadbbbbaaaaddddabba$
  \end{enumerate}

Thus, the decoded string is:

\begin{center}
{\tt badbadbbbbaaaaddddabba }
\end{center}

{\bf 5 \% for the correct answer; 5 \% for showing how it is derived.}
\end{answer}

\end{exlist}

\end{qupart}

\end{question}

\newpage


\end{document}