% (removed: repository web-viewer artifacts — file metadata, commit byline, and line-number gutter captured during extraction; not part of the LaTeX source)
% !TEX root = text_processing.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}

\vfill
\centering
\Huge{\edinred{[Information Extraction]\\Named Entity Recognition}}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview}

\begin{itemize}
\item \gray{Introduction to Information Extraction}
\begin{itemize}
	\item \gray{Definition + contrast with IR}
	\item \gray{Example Applications}
	\item \gray{Overview of Tasks}
	\item \gray{Overview of Approaches}
	\item \gray{Evaluation + Shared Task Challenges}
	\item \gray{Brief(est) history of IE}
\end{itemize}

\item \textbf{Named Entity Recognition}
\begin{itemize}
	\item \textbf{Task}
	\item \textbf{Approaches: Rule-based, Supervised Learning}
	\item \textbf{Entity Linking}
\end{itemize}

\item Relation Extraction
\begin{itemize}
	\item Task
	\item Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision
\end{itemize}
\end{itemize}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER - recap}

\begin{block}{\textbf{Entity Extraction/Named Entity Recognition (NER)}}
Task: Identify the \myemph{extent} and the \myemph{type} of each textual mention of an entity\\
The set of types is determined in advance (e.g. organisation, person, date, etc.)
\end{block}

\begin{center}
\begin{tabular}{ll}
\myhl{cyan!40}{Cable and Wireless} today announced \ldots & Extent: 0-3 ; Type = \myhl{cyan!40}{ORG} \\
\myhl{cyan!40}{IBM} and \myhl{cyan!40}{Microsoft} today announced  \ldots  & Extent: 0-1 ; Type = \myhl{cyan!40}{ORG} \\
				 							& Extent: 2-3 ; Type = \myhl{cyan!40}{ORG} \\
\myhl{brown!90}{John Lewis} hired  \ldots 				& Extent: 0-2 ; Type = \myhl{cyan!40}{ORG} \\
\myhl{brown!90}{Theresa May} hired. 				& Extent: 0-2 ;  Type = \myhl{brown!90}{PER}

\end{tabular}
\end{center}


\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER - recap}

\textbf{Types of entities  addressed by IE systems include:}\\
\begin{itemize}
\item \textbf{Named individuals}
\begin{itemize}
\item Organisations (ORG), persons (PER), books, films, ships, restaurants . . .
\item[\ra] \myhl{cyan!40}{Cable and Wireless} today announced \ldots ; Extent: \textbf{0-3} ; Type = \textbf{ORG} \\
\item[\ra] \myhl{brown!90}{Barack Obama} was the 44th president \ldots ; Extent: \textbf{0-2} ; Type = \textbf{PER} \\

\item Geo-Political entities (GPE), locations (LOC)
\item[\ra] The \myhl{carminered}{Mont Blanc} intersects France, Italy and Switzerland. ; Extent: \textbf{1-3} ; Type = \textbf{LOC} \\
\item[\ra] The Mont Blanc intersects \myhl{carminered!60}{France}, \myhl{carminered!60}{Italy} and \myhl{carminered!60}{Switzerland}. ; Extent: \textbf{4-5} ; Type = \textbf{GPE} \\

\end{itemize}

%\item Named kinds
%\begin{itemize}
%\item Proteins, chemical compounds/drugs, diseases, aircraft components . . .
%\end{itemize}
\item \textbf{Times}: temporal expressions dates, times of day
\begin{itemize}
\item[\ra] Let's meet at \myhl{orange}{2pm} next Friday  \ldots ; Extent: \textbf{3-4} ; Type = \textbf{TIME} \\
\item[\ra] Let's meet at 2pm next \myhl{orange!50}{Friday}  \ldots ; Extent: \textbf{5-6} ; Type = \textbf{DATE} \\
\end{itemize}

\item \textbf{Measures}: monetary expressions, distances/sizes, weights . . .
\begin{itemize}
\item[\ra] This watch costs \myhl{bananayellow}{£35}  \ldots ; Extent: \textbf{3-4} ; Type = \textbf{MONEY} \\
\end{itemize}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: coreference - recap}

\begin{block}{\textbf{Coreference}}
Different textual expressions that refer to the same real world entity are said to \myemph{corefer}.
\textbf{Coreference Task}: link together all textual references to the same \myemph{real world entity}.
\end{block}

Multiple references to the same entity in a text are rarely made using the same string:
\begin{itemize}
\item Pronouns: \textbf{Tony Blair} == \textbf{he}
\item Names/definite descriptions: \textbf{Tony Blair} == \textbf{the Prime Minister}
\item Abbreviated forms: \textbf{Theresa May} == \textbf{May}; \textbf{European Union} == \textbf{EU}
\item Orthographic variants: \textbf{alpha helix}  ==  \textbf{alpha-helix} == \textbf{$\bm{\alpha}$-helix} == \textbf{a-helix}
\end{itemize}

\vfill
Can be seen as a separate task or as part of entity extraction task
\vfill
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview}

\begin{itemize}
\item \gray{Introduction to Information Extraction}
\begin{itemize}
	\item \gray{Definition + contrast with IR}
	\item \gray{Example Applications}
	\item \gray{Overview of Tasks}
	\item \gray{Overview of Approaches}
	\item \gray{Evaluation + Shared Task Challenges}
	\item \gray{Brief(est) history of IE}
\end{itemize}

\item \textbf{Named Entity Recognition}
\begin{itemize}
	\item \gray{Task}
	\item \textbf{Approaches: Rule-based, Supervised Learning}
	\item Entity Linking
\end{itemize}

\item Relation Extraction
\begin{itemize}
	\item Task
	\item Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision
\end{itemize}
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Approaches to NER}

\textbf{Knowledge-engineering}
\begin{itemize}
\item leverage linguistic resources created by experts
\end{itemize}

\textbf{Supervised learning}
\begin{itemize}
\item use of machine learning techniques
\end{itemize}

\mycolor{lightgray}{\textbf{Bootstrapping}}<2->
\begin{itemize}
\item \mycolor{lightgray}{Use of \textbf{seed patterns} to identify named entities}<2->
\item \mycolor{lightgray}{Use known named entities to generate new patterns}<2->
\item \mycolor{lightgray}{Rinse, repeat}<2->
\end{itemize}
%https://arxiv.org/ftp/arxiv/papers/1511/1511.06833.pdf

\mycolor{lightgray}{\textbf{Distant supervision / lightly supervised methods}}<2->
\begin{itemize}
\item \mycolor{lightgray}{$\sim$ bootstrapping a machine learning system}<2->
\end{itemize}

%https://www.aclweb.org/anthology/C18-1183/

\vfill
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

Dominant approach in the 1990s and still in use in many IE systems today.

Such systems typically use
\begin{itemize}
\item named entity lexicons and
\item manually authored pattern/action rules or regular expression/FST recognisers
\end{itemize}

Example: NER system, developed for participation in MUC-6
\begin{itemize}
\item described in Wakao et al. (1996) \cite{wakao-etal-1996-evaluation}
\item[\ra] recognizes \myemph{organisation}, \myemph{person}, \myemph{location} and \myemph{time} expressions in \textbf{newswire texts}
\end{itemize}

System has three main stages:
\begin{enumerate}
\item Lexical processing
\item NE parsing
\item Discourse interpretation - Coreference Resolution
\item Discourse interpretation - Semantic Type Resolution
\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\begin{enumerate}
\item Lexical processing
\end{enumerate}

Rule-based NER systems use \textbf{specialized lexicons} \ra\ \myemph{gazetteers} (= geographical directory)

The Wakao et al. system has specialised lexicons for:
\vspace{-.5cm}
\begin{columns}
\begin{column}{.37\textwidth}
\begin{itemize}
\item \myemph{Organisations} – \textbf{2600} entries
\item \myemph{Locations} – \textbf{2200} entries
\item \myemph{Person names} – \textbf{500} entries
\end{itemize}

\end{column}
\begin{column}{.63\textwidth}

\begin{itemize}
\item \myemph{Company designators}: e.g. Corp, Ltd – \textbf{94} entries
\item \myemph{Person titles}: e.g. Mr, Dr, Reverend – \textbf{160} entries
\end{itemize}
\end{column}
\end{columns}
\vfill 
\only<2->{
Why not use even larger gazetteers?
\begin{itemize}
\item Gazetteer of British Place Names containing over 50,000 entries
\item[\ra] Many NEs occur in multiple categories
\item[\ra] \textbf{The larger the lexicons the greater the ambiguity}
\item Ex.: Ford \Ra\ \myemph{company} or \myemph{person} or \myemph{place}
\item listing of names is never complete \ra\ need a mechanism to type unseen NEs!
\end{itemize}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\begin{enumerate}
\item Lexical processing
\end{enumerate}

Example sentence: ``Norwich Investment Bank plc. today announced ...''

\begin{enumerate}
\renewcommand{\theenumi}{1\alph{enumi}}
\item<2-> Tokenisation, sentence splitting, morphological analysis, Part-Of-Speech tagging
\item[\ra]<2-> \scriptsize{\annot{Norwich}{NNP} \annot{Investment}{NNP} \annot{Bank}{NNP} \annot{plc.}{NN} \annot{today}{RB} \annot{announced}{VBD} ...}

\vfill
\item<3-> Gazetteer Lookup and Tagging: 
\begin{itemize}
	\item \textbf{ORG}anisations, \textbf{LOC}ations, \textbf{PER}sons, company designators (\textbf{CDG}), person titles
\end{itemize}
\item[\ra]<3-> \scriptsize{\annot{Norwich}{NNP/\textbf{LOC}} \annot{Investment}{NNP} \annot{Bank}{NNP} \annot{plc.}{NN/\textbf{CDG}} \annot{today}{RB} \annot{announced}{VBD} ...}

\vfill
\item<4-> Trigger Word Tagging
\begin{itemize}
	\item \textbf{trigger words} allow to classify certain multi-word names
	\item[\ra] Ex.: \myemph{Airlines} in "\textbf{Wing and Prayer} \myemph{Airlines}"
	\item system has trigger words for \textbf{ORG}anisations, \textbf{GOV}ernment institutions, \textbf{LOC}ations
\end{itemize}
\item[\ra]<4->  \scriptsize{\annot{Norwich}{NNP/\textbf{LOC}} \annot{Investment}{NNP} \annot{Bank}{NNP/\red{\bf ORG-TRIGGER}} \annot{plc.}{NN/\textbf{CDG}} \annot{today}{RB} \annot{announced}{VBD} ...}

\end{enumerate}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\begin{enumerate}
\setcounter{enumi}{1}
\item NE parsing
\end{enumerate}

Hand-produced rules:\\
\vspace{-.5cm}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{itemize}
\item[]
\begin{itemize}
	\item 177  for proper names
	\item 94 for organisation
	\item 54 for person 
\end{itemize}
\end{itemize}

\end{column}
\begin{column}{.5\textwidth}

\begin{itemize}
\item 11 for location
\item 18 for time expressions.
\end{itemize}

\end{column}
\end{columns}

\vfill

A fragment of the proper name grammar:
\vspace{-.5cm}
\begin{columns}
\begin{column}{.4\textwidth}
\scriptsize{\begin{itemize}
\item NP --> ORGAN\_NP
\item NAMES\_NP --> NNP NAMES\_NP
\item NAMES\_NP --> NNP
\end{itemize}}

\end{column}
\begin{column}{.6\textwidth}

\scriptsize{\begin{itemize}
\item ORGAN\_NP --> LIST\_LOC\_NP NAMES\_NP CDG\_NP
\item ORGAN\_NP --> LIST\_ORGAN\_NP NAMES\_NP CDG\_NP
\item ORGAN\_NP --> NAMES\_NP '\&' NAMES\_NP
\end{itemize}}
\end{column}
\end{columns}

\vfill

Rule {\scriptsize "ORGAN\_NP --> NAMES\_NP '\&' NAMES\_NP"} means:\\
an unclassified \myemph{proper name} (NAMES\_NP) followed by '\&' followed by an unclassified \myemph{proper name} is an \myemph{organisation name}\\
\ra\ \textbf{Marks \& Spencer} or \textbf{American Telephone \& Telegraph}


\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\begin{enumerate}
\setcounter{enumi}{2}
\item Discourse interpretation - \textbf{Coreference Resolution}
\end{enumerate}

\begin{enumerate}
\renewcommand{\theenumi}{3\alph{enumi}}
\item When the name class of an \textbf{antecedent} (resp. \textbf{postcedent}) is known then establishing coreference allows the name class of the \textbf{anaphor} (resp. \textbf{cataphor}) to be established.
\end{enumerate}

\begin{block}{Anaphora/cataphora}
In a narrower sense, \myemph{anaphora} is the use of an expression that depends specifically upon an \textbf{antecedent} expression and thus is contrasted with \myemph{cataphora}, which is the use of an expression that depends upon a \textbf{postcedent} expression. \source{Wikipedia}
\end{block}

\begin{itemize}
\item Ex1.: \myemph{Ford Motor Co.} was founded in Detroit in 1903. \myemph{It} was the first to introduce...
\end{itemize}


\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\begin{enumerate}
\setcounter{enumi}{2}
\item Discourse interpretation - \textbf{Coreference Resolution}
\end{enumerate}

\begin{enumerate}
\setcounter{enumi}{1}
\renewcommand{\theenumi}{3\alph{enumi}}
\item An unclassified PN may be co-referential with a variant form of a classified PN, e.g.:
\end{enumerate}

\begin{itemize}
\item Ex2.:
\begin{itemize}
	\item \myemph{Ford Motor Co.} was founded in Detroit in 1903, ..., \myemph{Ford} was the first to introduce...
	\item \myemph{Creative Artists Agency} is a US talent agency ...  In 2016, \myemph{CAA} had 1,800 employees
	
\end{itemize}
\item[\ra] The unclassified PN may be inferred to have the same class as the classified PN.\\
	\item[\ra] Wakao et al. use 45 heuristics of this type for organisation, location, and person names.
\end{itemize}

\vspace{.3cm}
\only<2->{
\begin{enumerate}
\setcounter{enumi}{2}
\renewcommand{\theenumi}{3\alph{enumi}}
\item  An unclassified PN may be co-referential with a definite NP
\end{enumerate}
\begin{itemize}
\item Ex3.: \myemph{Kellogg}, the breakfast cereal \myemph{\underline{manufacturer}}
\end{itemize}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\begin{enumerate}
\setcounter{enumi}{3}
\item Discourse interpretation - \textbf{Semantic Type Inference}
\end{enumerate}

\begin{block}{}
Semantic type information about the arguments in certain \textbf{syntactic relations} is used to make inferences permitting the classification of PNs
\end{block}


\begin{enumerate}
\renewcommand{\theenumi}{4\alph{enumi}}
\item<1-> \myemph{noun-noun qualification}: PN qualifies an organisation-related object \ra\ organisation
\begin{itemize}
\item[\ra] Erickson \textbf{\underline{stocks}} \Ra\ \textbf{\annot{Erickson}{ORG}}
\end{itemize}

\item<2-> \myemph{possessives}: PN stands in a possessive relation to an organisation post \ra\ organisation
\begin{itemize}
\item[\ra] \textbf{\underline{vice president of}} ABC, ABC\textbf{\underline{’s vice president}} \Ra\ \textbf{\annot{ABC}{ORG}}
\end{itemize}

\item<3-> \myemph{apposition}: PN is apposed with a known organisation post \ra\ person name
\begin{itemize}
\item[\ra] Miodrag Jones, \textbf{\underline{president of XYZ}} \Ra\ \textbf{\annot{Miodrag Jones}{PER}}
\end{itemize}

\item<4-> \myemph{verbal arguments}: PN names an entity involved in a verbal frame where the semantic type of the argument position is known \ra\ classify accordingly
\begin{itemize}
\item[\ra] Smith \textbf{\underline{retired from his position}} as \Ra\ \textbf{\annot{Smith}{PER}}
\end{itemize}

\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\textbf{Evaluation of Wakao et al.}

MUC-6 NE evaluation set: a \textbf{blind test set} of 30 Wall Street Journal articles containing:
\begin{columns}
\begin{column}{.5\textwidth}
\begin{itemize}
\item 449 organisation names
\item 373 person names
\item 110 location names
\item 111 time expressions
\end{itemize}

\end{column}
\begin{column}{.5\textwidth}

\begin{center}
Results:
\scriptsize{
\begin{center}
\begin{tabular}{llll}
\toprule
Proper Name Class  &  Recall &  Precision & F1\\ \midrule 
Organisation  	& 91 \% & 91 \% & 91.0 \%\\ 
Person 		& 90 \% & 95 \% & 92.4 \% \\ 
Location  		& 88 \% & 89 \% & 88.5 \% \\ 
Time 		& 94 \% & 97 \% & 95.5 \% \\ \midrule 
Overall 		& 91 \% & 93 \% & 92.0 \% \\ \bottomrule
\end{tabular}
\end{center}
}
\end{center}
\end{column}
\end{columns}

\vfill

\Ra\ Best system results on this evaluation had F1 measure = 96.42\%\\
\Ra\ Human results were 96.68\%

\vfill
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Knowledge Engineering Approaches}

\vfill

\textbf{Strengths}
\begin{itemize}
	\item \textbf{High performance} – only several points behind human annotators
	\item \textbf{Transparent} – easy to understand what system is doing/why
\end{itemize}

\vfill

\textbf{Weaknesses}
\begin{itemize}
	\item Porting to another domain requires substantial \textbf{rule re-engineering}
	\item Acquisition of \textbf{domain-specific lexicons}
	\item Rule writing requires high \textbf{levels of expertise}
\end{itemize}
\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Approaches to NER}

\mycolor{lightgray}{\textbf{Knowledge-engineering}}
\begin{itemize}
\item \mycolor{lightgray}{leverage linguistic resources created by experts}
\end{itemize}

\textbf{Supervised learning}
\begin{itemize}
\item use of machine learning techniques
\end{itemize}

\mycolor{lightgray}{\textbf{Bootstrapping}}
\begin{itemize}
\item \mycolor{lightgray}{Use of \textbf{seed patterns} to identify named entities}
\item \mycolor{lightgray}{Use known named entities to generate new patterns}
\item \mycolor{lightgray}{Rinse, repeat}
\end{itemize}
%https://arxiv.org/ftp/arxiv/papers/1511/1511.06833.pdf

\mycolor{lightgray}{\textbf{Distant supervision / lightly supervised methods}}
\begin{itemize}
\item \mycolor{lightgray}{$\sim$ bootstrapping a machine learning system}
\end{itemize}

%https://www.aclweb.org/anthology/C18-1183/

\vfill
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

\begin{block}{Aim of Supervised Learning}
Address the \textbf{portability} / \textbf{generalisation} problems inherent in knowledge engineering NER
\end{block}

\begin{itemize}
\item Manually authoring rules \Ra\ systems learn from \textbf{annotated examples}
\item Moving to new domain requires only annotated data in the domain
\item[\ra] supplied by \textbf{domain expert} without need for expert computational linguist
\end{itemize}

\vfill

A wide variety of supervised learning techniques have been tried, including:
\vspace{-.3cm}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{itemize}
	\item Hidden Markov Models (HMM)
	\item Decision Trees
	\item Maximum Entropy models
	\item Support Vector Machines (SVM)
\end{itemize}

\end{column}
\begin{column}{.5\textwidth}

\begin{itemize}
	\item Conditional Random Fields (CRF)
	\item AdaBoost
	\item Deep Learning
\end{itemize}

\end{column}
\end{columns}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

Two types of systems that may learn:
\begin{enumerate}
\item \myemphb{patterns} that match extraction targets \ra\ less developed recently
\item \myemphb{classifiers} that label tokens as \myemph{beginning/inside/outside} a \textbf{tag type}
\item[\ra] Systems operate as \textbf{Sequence Labelling} systems
\end{enumerate}

\only<2>{
In \textbf{sequence labelling}, each token is given one of three label types \myemph{B}, \myemph{I} or \myemph{O}:
\begin{itemize}
\item \myemph{B$_{CLASS}$} if the token is at the \myemph{beginning} of a named entity of class = $CLASS$ 
\begin{itemize}
\item where CLASS $\in$ \{\textbf{ORG}, \textbf{PER}, \textbf{LOC}, etc.\}
\end{itemize}

\item \myemph{I$_{CLASS}$} if the token is \myemph{inside} a named entity
\item \myemph{O} if the token is \myemph{outside} any named entity
\end{itemize}

\Ra\ referred to as the \myemph{BIO} scheme
}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

\textbf{Example:} \\
\scriptsize{\annot{American Airlines}{ORG}, a unit of \annot{AMR Corp.}{ORG}, immediately matched the move, spokesman \annot{Tim Wagner}{PER} said.}\\
\source{Jurafsky and Martin, 2nd ed., p. 730}

\vspace{1cm}

\only<2->{
In \myemph{BIO} encoding this example looks like this:


\begin{textblock*}{40mm}[0,0](100mm,15mm)

\colorbox{lightgray}{\parbox{3cm}{
\scriptsize{
%\begin{center}
\setlength\extrarowheight{-3pt}
\begin{tabular}{ll}
American 		& \B{ORG} \\
Airlines		& \I{ORG}\\
,			& O\\
a			& O\\
unit			& O\\
of			& O\\
AMR			& \B{ORG}\\
Corp.		& \I{ORG}\\
\tikzmark{n2},			& O\\
immediately	& O\\
matched		& O\\
the			& O\\
move		& O\\
,			& O\\
spokesman	& O\\
Tim			& \B{PER}\\
Wagner		& \I{PER}\\
said			& O\\
.			& O
\end{tabular}
%\end{center}
}}}
\end{textblock*}
}

\vfill

\only<3>{
Supervised ML technique:
\begin{itemize}
\item Trained on a corpus of labelled sequences, like this \tikzmark{n1}
\item Aim: predict the labelling of a new, unlabelled example.
\end{itemize}

\begin{tikzpicture}[overlay,remember picture]
    \draw[very thick, -Stealth, carminered]         ($({pic cs:n1})+(0ex,0ex)$)  to [bend left, sloped, ""]  ($({pic cs:n2})+(-5ex,+0ex)$);
\end{tikzpicture}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

\myemph{Features for Sequence Labelling}

Each token is typically represented as a set of features:
\begin{itemize}
\item the \textbf{token} itself
\item characteristics of the \textbf{token}: stem, Part-Of-Speech, etc. 
\item characteristics of the \textbf{neighbouring tokens} (window of $\pm$ 2 or 3 tokens either side)
%\begin{itemize}
%	\item 
%\end{itemize}
\end{itemize}

Typical features used: \source{Jurafsky and Martin, 2nd ed., p. 731}
\scriptsize{
\begin{center}
\setlength\extrarowheight{-3pt}
\begin{tabular}{lll}
\toprule
\textbf{Features} & \textbf{Explanations} & Ex.: "\textbf{The shop L'Occitane en Provence}" \\ 
\midrule
Lexical items & The token to be labeled & \textbf{L'Occitane}\\
Stemmed lexical items & Stemmed version of the target token & \textbf{Occitan}\\
Shape & Orthographic pattern of the target token & \textbf{X’Xxxxxxxx} or \textbf{X’Xx} \\
Character affixes & Character-level affixes &  \textbf{L}, \textbf{L'}, \textbf{L'O}, \textbf{L'Oc}, \textbf{tane}, \textbf{ane}, \textbf{ne}, \textbf{e}\\
Part of Speech & Part of speech of the token & \textbf{NNP} \\
Syntactic chunk labels & Base-phrase chunk label & \textbf{B-NP}\\
Gazetteer or name list & Token seen in a named entity lists & -\\
Predicted token(s) & Previously predicted classes & \textbf{O O}\\
Bag of words/Bag of n-grams & Tokens and/or n-grams in the context & \textbf{The}, \textbf{shop}, \textbf{The shop}, etc. \\
\bottomrule
\end{tabular}
\end{center}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

Graphical representation of features used for NER:
\begin{center}
\includegraphics[width=.85\textwidth]{ner_jurafsky}\\
\source{Jurafsky and Martin, 2nd ed., p. 733}
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

\begin{center}
\only<1>{\includegraphics[width=.85\textwidth]{ner_sequence_labelling_1}}
\only<2>{\includegraphics[width=.85\textwidth]{ner_sequence_labelling_2}}
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

\textbf{BIO sequence labelling:} Carreras et al. (2003) \cite{Carreras:2003:SNE}
\begin{itemize}
\item[\ra] best score at CONLL 2003 NER shared task
\end{itemize}

Use of a two-pass method:
\begin{enumerate}
\item \myemph{NE detection}: assign un-typed \myemph{BIO} tags \ra\ find NE boundaries \textbf{whatever the class}
\item \myemph{NE classification}: assign a class to all \myemph{BI} tags
\end{enumerate}

Use of additional features:
\begin{itemize}
\item Type pattern of consecutive words in context: functional (f), capitalized (C), lowercased (l) etc. \ra\ ``John Smith paid 3 euros'' \Ra\ CClxl
\end{itemize}

\begin{block}{Use of \textbf{Adaboost} classifier:}
\begin{itemize}
\item \textbf{Adaptive Boosting} is a meta-algorithm that combines outputs of other learning algorithms (called \textbf{weak learners}).
Subsequent weak learners are tweaked in favor of the instances that are misclassified by previous classifiers.
\end{itemize}

\end{block}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: NER: Supervised Learning}

\textbf{BIO sequence labelling:} Carreras et al. (2003) \cite{Carreras:2003:SNE}

\begin{center}
\begin{tabular}{lllll}
			& \MC{2}{c}{English} 	& \MC{2}{c}{German} \\
			& Precision (\%) & Recall (\%) & Precision (\%) & Recall (\%) \\ \toprule
NE Detection 	& 91.93 		 & 94.02 		& 85.85	& 72.61 \\
NE Classification	& 84.05		 & 85.96		& 75.47	& 63.82 \\ 
\midrule
			& \MC{2}{c}{Accuracy} & \MC{2}{c}{Accuracy} \\
Oracle		& \MC{2}{c}{95.14} & \MC{2}{c}{85.14} \\
\bottomrule
\end{tabular}
\end{center}

\myemph{Oracle}: Classification with a perfect Detection

\Ra\ LOC and PER score consistently higher than ORG and MISC

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview}

\begin{itemize}
\item \gray{Introduction to Information Extraction}
\begin{itemize}
	\item \gray{Definition + contrast with IR}
	\item \gray{Example Applications}
	\item \gray{Overview of Tasks}
	\item \gray{Overview of Approaches}
	\item \gray{Evaluation + Shared Task Challenges}
	\item \gray{Brief(est) history of IE}
\end{itemize}

\item \textbf{Named Entity Recognition}
\begin{itemize}
	\item \gray{Task}
	\item \gray{Approaches: Rule-based, Supervised Learning}
	\item \textbf{Entity Linking}
\end{itemize}

\item Relation Extraction
\begin{itemize}
	\item Task
	\item Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision
\end{itemize}
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Entity Linking}

An important application of IE is knowledge base population (\myemph{KBP})
\begin{itemize}
	\item facts are gathered from open access web sources 
	\item then used to build a structured information repository
\end{itemize}

\Ra\ To correctly assemble facts, the entities must be linked to the appropriate entry in the KB

\vfill

\begin{block}{\myemphb{Entity Linking Task}: }
Given a text with a recognised NE mention in that text and a knowledge base (KB), such as Wikipedia, link the NEs to the matching entry in the KB if there is one, else create an entry.
\end{block}
 
\vfill

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Entity Linking}


\textbf{Is this task difficult? – yes!!}
\begin{itemize}
\item Wikipedia contains over 200 entries for \myemph{John Smith}, at least 1,716 places called \myemph{San José} (or San Jose)
%\item \myemph{Ashoka Restaurant}, \myemph{ABC Taxis}, . . .
\end{itemize}

%\tikz[remember picture,overlay, right] \node[opacity=0.2,inner sep=0pt] at (current page.center){\includegraphics[height=\textheight]{wiki_john_smith}};

\source{Wikipedia}
\vspace{-.3cm}
\begin{center}
\includegraphics[width=.85\textwidth]{wiki_john_smith}\\
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Entity Linking: Approaches}

\begin{itemize}
\item Simple idea: given a text $T$ containing an NE mention $m$ and using Wikipedia as a KB
\begin{enumerate}
\item index all pages in the KB using IR
\item build a query from $T$ containing $m$ + search the KB
\item pick the best ranked page from step 2
\end{enumerate}
\item[\ra] does not work very well

\item More successful approaches consider disambiguating all NEs jointly
\begin{itemize}
\item \textbf{Intuition}: in disambiguating a text mentioning \myemph{Ashoka} and \myemph{Sheffield},
the \myemph{Ashoka} mentioned is likely to be the one in Sheffield, while the \myemph{Sheffield} mentioned is likely to be the one containing an Ashoka restaurant.
\item See, e.g., Alhelbawy and Gaizauskas (2014)
\end{itemize}

\end{itemize}

\end{frame}