% !TEX root = text_processing.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}

\vfill
\centering
\Huge{\edinred{[Information Extraction]\\Relation Extraction}}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview}

\begin{itemize}
\item \gray{Introduction to Information Extraction}
\begin{itemize}
	\item \gray{Definition + contrast with IR}
	\item \gray{Example Applications}
	\item \gray{Overview of Tasks}
	\item \gray{Overview of Approaches}
	\item \gray{Evaluation + Shared Task Challenges}
	\item \gray{Brief(est) history of IE}
\end{itemize}

\item \gray{Named Entity Recognition}
\begin{itemize}
	\item \gray{Task}
	\item \gray{Approaches: Rule-based, Supervised Learning}
	\item \gray{Entity Linking}
\end{itemize}

\item \textbf{Relation Extraction}
\begin{itemize}
	\item \textbf{Task}
	\item \textbf{Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision}
\end{itemize}
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Overview}

\textbf{Relation Extraction}
\begin{itemize}
\item \textbf{Task definition}
\end{itemize}

Approaches to Relation Extraction
\begin{itemize}
\item Knowledge-engineering approaches to RE
\item Supervised learning approaches to RE
\item Bootstrapping Approaches to RE
\item Distant Supervision Approaches to RE
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction - recap}

\begin{block}{\textbf{Relation Extraction (RE)}}
Task: Identify all \myemph{assertions of relations} holding between \myemph{entities} in a text $T$\\
\begin{itemize}
\item The entities are identified in a previous entity extraction step
\item The set of possible relations $\mathbf{R}$ is determined in advance
\end{itemize}
\end{block}

\textbf{Note:}
\begin{itemize}
\item relations in $\mathbf{R}$ are usually binary
\item the entity classes involved in a relation in $\mathbf{R}$ are assumed to be a subset of those identified in the entity extraction process
\end{itemize}

\textbf{Generally divided into two subtasks:}
\begin{enumerate}
\item \myemph{Relation detection}: find pairs of entities between which a relation holds
\item \myemph{Relation classification}: determine the type of a previously detected relation 
\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Examples}

\textbf{Some examples}

\begin{itemize}
\item \textsc{location\_of} holding between:
\begin{itemize}
	\item \textsc{organisation} and \textsc{geopolitical\_location}
	\item medical \textsc{investigation} and \textsc{body\_part}
	\item \textsc{gene} and \textsc{chromosome\_location}
\end{itemize}

\item \textsc{employee\_of} holding between \textsc{person} and \textsc{organisation}\\
\item \textsc{product\_of} holding between \textsc{artifact} and \textsc{organisation}\\
\item \textsc{is\_exposed\_to} holding between \textsc{organisation} and \textsc{risk}\\
\item \textsc{is\_associated\_with} holding between \textsc{drug} and \textsc{side\_effect}\\
\item \textsc{interaction} holding between \textsc{protein} and \textsc{protein}\\
\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Challenges}

\textbf{Challenges for Relation Extraction}
\begin{enumerate}
\item<1-> Many different ways to express the same relation
\begin{itemize}
\item \myemph{Canonical}: \annot{Microsoft}{ORG} \textbf{\underline{is located in}} \annot{Redmond}{LOC}
\item \myemph{Synonyms}: \annot{Microsoft}{ORG} \textbf{\underline{is located/based/headquartered in}}  \annot{Redmond}{LOC}
\item \myemph{Syntactic variations}:
\begin{itemize}
	\item \annot{Microsoft}{ORG}, the software giant and ... , \textbf{\underline{is based in}} \annot{Redmond}{LOC}
	\item \annot{Redmond}{LOC}\textbf{\underline{-based}} \annot{Microsoft}{ORG} ...
	\item \annot{Redmond}{LOC}\textbf{\underline{'s}} \annot{Microsoft}{ORG} ...
	\item \annot{Redmond}{LOC} software giant \annot{Microsoft}{ORG} ...
\end{itemize}
\end{itemize}

\item<2-> Relations often involve coreference links\\
\small{
\myemph{\annot{Bill Gates}{PER}} \textbf{\underline{is co-founder, technology advisor and board member}} of \annot{Microsoft}{ORG}. \myemph{\annot{He}{PER}} \textbf{\underline{served as chairman of the board}} until Feb. 4, 2014. On June 27, 2008, \myemph{\annot{Gates}{PER}} \textbf{\underline{transitioned out}} of a day-to-day role in the company to spend more time on his global health and education work at the \annot{Bill \& Melinda Gates Foundation}{ORG}.
}
\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Challenges}

\textbf{Challenges for Relation Extraction}
\begin{enumerate}
\setcounter{enumi}{2}
\item The information required may be spread across multiple sentences
\item The information may be implied by the text rather than explicitly asserted \ra\ \myemphb{inference}
\end{enumerate}

\textbf{Ex1.:}
\myemph{\annot{Dirk Ruthless}{PER}} of \annot{MegaCorp}{ORG} made a stunning announcement today. In September \myemph{\annot{he}{PER}} will be stepping down as \annot{Chief Executive Officer}{POS} to spend more time with his pet piranhas.

\begin{itemize}
\item[\Ra] resolve \textbf{pronominal anaphor}  \myemph{Dirk Ruthless} \Lra\ \myemph{he} to determine the corporate \textsc{position}
\item[\Ra] In Ex1.: no explicit statement that \myemph{Dirk Ruthless} \textbf{is} CEO of MegaCorp
\begin{itemize}
	\item "\myemph{\annot{Dirk Ruthless}{PER}} of \annot{MegaCorp}{ORG}"  + "will be stepping down as \annot{Chief Executive Officer}{POS}" = \myemph{\annot{Dirk Ruthless}{PER}} is CEO of \annot{MegaCorp}{ORG}
\end{itemize}
\item Solving RE may imply solving \myemphb{textual entailment}
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Overview}

\gray{Relation Extraction}
\begin{itemize}
\item \gray{Task definition}
\end{itemize}

\textbf{Approaches to Relation Extraction}
\begin{itemize}
\item \textbf{Knowledge-engineering approaches to RE}
\item Supervised learning approaches to RE
\item Bootstrapping Approaches to RE
\item Distant Supervision Approaches to RE
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Knowledge-engineering}

\colorbox{liumlightgray}{
\parbox{.99\textwidth}{

\annot{Mr. \tikzmark{re_a} Wright}{PER}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \annot{Merrill Lynch \tikzmark{re_b} Canada Inc.}{ORG}\\

\begin{tikzpicture}[overlay,remember picture]
 \draw [very thick, color=carminered] ($({pic cs:re_a})+(0ex,-1ex)$) -- ($({pic cs:re_a})+(0ex,-3ex)$);
 \draw [very thick, color=carminered] ($({pic cs:re_a})+(0ex,-3ex)$) -- ($({pic cs:re_b})+(0ex,-3ex)$)  node [midway, below, color=carminered] {is-employed-by};
 \draw [very thick, color=carminered] ($({pic cs:re_b})+(0ex,-3ex)$) -- ($({pic cs:re_b})+(0ex,-1ex)$);
\end{tikzpicture}

}}

\vfill

Such systems use manually authored rules and can be divided into
\begin{itemize}
\item<1-> “\myemph{shallow}”: systems engineered to the IE task, typically using \myemphb{pattern-action} rules
\begin{center}
\begin{tabular}{ll}
Pattern: & \textbf{"\$Person, \$Position of \$Organization"} \\
Action: & \textbf{add-relation(is-employed-by(\$Person,\$Organization))} \\
\end{tabular}
\end{center}
\vspace{.5cm}
\item<2-> “\myemph{deep}”: linguistically inspired language understanding systems
\end{itemize}

\vfill

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Knowledge-engineering}

\begin{itemize}
\item “\myemph{deep}”: linguistically inspired language understanding systems
\begin{itemize}
	\item parse input to identify key grammatical relations (e.g. subject, object)
	\item use transduction rules on parser output to extract relations 
	\item[\ra] more powerful than regex on NE tags alone
\end{itemize}
\end{itemize}
\vspace{3.5cm}

\only<2->{
\begin{itemize}
\item Ex. parse trees:
\begin{itemize}
\item Multiple surface forms share underlying syntactic structure
\item \textsc{subject} = PER, \textsc{object} = ORG and \textsc{verb} = \textbf{works for}
\end{itemize}
\end{itemize}

\begin{textblock*}{50mm}[0,0](115mm,25mm)
\includegraphics[height=.45\textheight]{syntaxtree1}
\end{textblock*}

\begin{textblock*}{50mm}[0,0](55mm,25mm)
\includegraphics[height=.45\textheight]{syntaxtree2}
\end{textblock*}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Knowledge-engineering}
\vfill
\textbf{Strengths}
\begin{itemize}
\item High precision
\item Interpretable results \ra\ system behaviour is human-comprehensible
\end{itemize}
\vfill
\textbf{Weaknesses}
\begin{itemize}
\item The writing of rules has no end
\item New rules needed for every new domain 
\begin{itemize}
	\item pattern-action rules for shallow approaches
	\item transduction rules for deep approaches
\end{itemize}
\end{itemize}
\vfill
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Overview}

\gray{Relation Extraction}
\begin{itemize}
\item \gray{Task definition}
\end{itemize}

\textbf{Approaches to Relation Extraction}
\begin{itemize}
\item \gray{Knowledge-engineering approaches to RE}
\item \textbf{Supervised learning approaches to RE}
\item Bootstrapping Approaches to RE
\item Distant Supervision Approaches to RE
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Supervised learning}

\textbf{Question: what is to be learned?}

Answer 1: \myemph{rules}
\begin{itemize}
\item to match \textbf{relation bearing sentences}
\item capture the \textbf{relation arguments} in the matched text
\end{itemize}

Answer 2: \myemph{binary classifier}
\begin{itemize}
\item Classifies a sentence as to whether it bears a specific relation between some entity types
\item Specialized binary classifier
\item Can be divided into two stages: 
\begin{itemize}
\item \myemph{relation detection}: determines whether a sentence expresses a relation (binary)
\item \myemph{relation classification}: determines the relation (multi-way)
\end{itemize}

\item Rule learning popular until early 2000's. Then classifier approach.
\item[\ra] Details on classifier approach only
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Supervised Learning}

Classification approaches to relation extraction:
\begin{itemize}
\item Assume entities  are already tagged \ra\ output of NER
\item Use an algorithm to learn binary classifiers to distinguish instances where
\begin{itemize}
\item entities co-occur and relation holds (positive instances)
\item entities co-occur and relation does not hold (negative instances)
\end{itemize}
\end{itemize}

Key issue: what \textbf{features} to use to represent instances?
\begin{itemize}
\item 3 broad classes:
\begin{itemize}
\item Features of the Named Entities
\item Features from the words in the text
\begin{itemize}
	\item words between the two NE 
	\item words surrounding the candidates (left of 1st word and right of 2nd word)
\end{itemize}
\item Features about the entity pair within the sentence, e.g.
\begin{itemize}
	\item distance between the entities (in words or constituents)
	\item is there a NE in between them?
	\item clues from the syntactic structure of the sentence (parse tree)
\end{itemize}
\end{itemize}

\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Supervised Learning: Example}

{\scriptsize \annot{American Airlines}{ORG}, a unit of \annot{AMR Inc.}{ORG}, immediately matched the move, spokesman \annot{Tim Wagner}{PER} said. }\\ 
\source{Jurafsky and Martin, 2nd ed., p. 730}

\vspace{.2cm}

Features extracted from this example:
\scriptsize{
\begin{center}
\setlength\extrarowheight{-3pt}
\begin{tabular}{ll}
\toprule
\textbf{Features} & \textbf{Value}  \\ 
\midrule
\MC{2}{l}{\textbf{Entity-based features} }\\
~~~~~~~~Entity$_1$ type			& ORG\\
~~~~~~~~Entity$_1$ head 	& \textit{airlines}\\
~~~~~~~~Entity$_2$ type		& PERS \\
~~~~~~~~Entity$_2$ head 		& \textit{Wagner}\\
~~~~~~~~Concatenated types		& ORGPERS \\ %\midrule
\MC{2}{l}{\textbf{Word-based features} }\\
~~~~~~~~Between-entity BOW 	& \{ \textit{a, unit, of, AMR, Inc., immediately, matched, the, move, spokesman} \}\\
~~~~~~~~Word(s) before Entity$_1$ 	& NONE\\
~~~~~~~~Word(s) after Entity$_2$		& \textit{said}\\ %\midrule
\MC{2}{l}{\textbf{Syntactic features} }\\
~~~~~~~~Constituent path	& NP \ua\ NP \ua\ S \ua\ S \da\ NP\\
~~~~~~~~Base syntactic chunk path & NP \ra\ NP \ra\ PP \ra\ NP \ra\ VP \ra\ NP \ra\ NP \\
~~~~~~~~Typed-dependency path & \textit{Airlines} \la\ $_{subj}$ \textit{matched} \la\ $_{comp}$ \textit{said} \ra\ $_{subj}$ \textit{Wagner} \\
\bottomrule
\end{tabular}

\source{Jurafsky and Martin, 2nd ed., p. 730}
\end{center}
}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Supervised Learning}

\vfill

\textbf{Strengths}
\begin{itemize}
\item No need to write extensive/complex rule sets for each domain
\item System can adapt to any new domain \ra\ provided that training data is supplied
\item[\ra] beware of data sparsity!
\end{itemize}

\vfill

\textbf{Weaknesses}
\begin{itemize}
\item Quality of relation extraction dependent on quality and quantity of training data
\item[\ra] can be difficult, costly and time consuming to generate
\item Feature extractors can be noisy (e.g. parsers) \ra\ reduce overall performance
\end{itemize}

\vfill

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Overview}

\gray{Relation Extraction}
\begin{itemize}
\item \gray{Task definition}
\end{itemize}

\textbf{Approaches to Relation Extraction}
\begin{itemize}
\item \gray{Knowledge-engineering approaches to RE}
\item \gray{Supervised learning approaches to RE}
\item \textbf{Bootstrapping Approaches to RE}
\item Distant Supervision Approaches to RE
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping}

\textbf{Motivation}: reduce the number of manually labelled examples needed to build a system

\textbf{Requirements}: 
\begin{enumerate}
\item a document collection $\mathcal{D}$
\item set of trusted tuples \textbf{T}, also called \myemph{seed tuples}
\item set of trusted patterns \textbf{P}, also called \myemph{seed patterns}
\end{enumerate}

\textbf{Principle}:
\begin{enumerate}
%\setcounter{enumi}{3}
\item Find tuples from \textbf{T} in $\mathcal{D}$ \Ra\ extract patterns, add them to \textbf{P}
\item Match patterns \textbf{P} in $\mathcal{D}$ \Ra\ extract tuples, add them to \textbf{T}
\item[\ra] Rinse, repeat
\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping}

\begin{center}
\includegraphics[width=.55\textwidth]{pattern_based_RE}

\source{Jurafsky and Martin, 2nd ed., p. 740}
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}

\myemphb{DIPRE} Dual Iterative Pattern Relation Expansion – proposed by Sergey Brin (1999) \cite{Brin:1998}

\vfill

\textbf{Aim}: to extract useful relational tuples from the Web, of the form \tuple{\textsc{person}, \textsc{book\_title}}
Ex.: \tuple{\textsc{Leo Tolstoy}, \textsc{War and Peace}}

\vfill

\textbf{Method}:
\begin{itemize}
\item Exploit duality of patterns and relations
\begin{itemize}
\item Good tuples help find good patterns
\item Good patterns help find good tuples
\end{itemize}

\item Use the bootstrapping method starting with user-supplied tuples
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}

Main loop in DIPRE is the following:
\begin{enumerate}
\item $\mathcal{R}'$ \la\ Seed tuples
\begin{itemize}
\item[] $\mathcal{R}'$ contains sample instances of the target relation
\end{itemize}

\item $\mathcal{O}$ \la\ FindOccurrences($\mathcal{R}'$, $\mathcal{D}$)
\begin{itemize}
\item[] $\mathcal{O}$ contains all occurrences of tuples in $\mathcal{R}'$ appearing in $\mathcal{D}$
\end{itemize}

\item $\mathcal{P}$ \la\ GenPatterns($\mathcal{O}$)
\begin{itemize}
\item[] $\mathcal{P}$ contains the patterns generated based on the occurrences
\item[] Seek low-error-rate patterns, ideally with high coverage
\end{itemize}

\item $\mathcal{R}'$ \la\ $M_{\mathcal{D}}(\mathcal{P})$
\begin{itemize}
\item[] Update $\mathcal{R}'$ with the tuples from $\mathcal{D}$ matching patterns in $\mathcal{P}$
\end{itemize}

\item Stop if $\mathcal{R}'$ is large enough, otherwise go to \circled{2}{black}

\end{enumerate}

\begin{textblock*}{50mm}[0,0](103mm,7mm)
\tiny{
\begin{tabular}{ll}
\toprule
Isaac Asimov & The Robots of Dawn \\
Charles Dickens & Great Expectations \\
Leo Tolstoy & War and Peace \\
Jane Austen & Pride and Prejudice \\
Marcel Proust & In Search of Lost Time \\
\bottomrule
\end{tabular}
}
\end{textblock*}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}

\begin{enumerate}
\setcounter{enumi}{1}
\item $\mathcal{O}$ \la\ FindOccurrences($\mathcal{R}'$, $\mathcal{D}$)
\begin{itemize}
\item[] $\mathcal{O}$ contains all occurrences of tuples in $\mathcal{R}'$ appearing in $\mathcal{D}$
\end{itemize}
\end{enumerate}


\begin{itemize}
\item \textbf{Occurrences} are defined as 7-tuples \myemph{\tuple{author, title, order, url, prefix, middle, suffix}}
\begin{itemize}
\item \myemph{order} is \textbf{true} for \tuple{author, title} tuples, \textbf{false} for \tuple{title, author} tuples
\item \myemph{url} is the document url containing the occurrence
\item \myemph{prefix} is the $m$ characters (in tests $m=10$) preceding the author (or title)
\item \myemph{middle} is the text between author and title
\item \myemph{suffix} is the $m$ characters following the title (or author)
\end{itemize}

\item<2-> Ex. search for \myemphb{\tuple{Charles Dickens, Great Expectations}}  in the domain \textbf{www.books.com}
\begin{itemize}
	\item[\ra] URL: \textbf{www.books.com/TopRated}
	\item[\ra] text: "The famous writer Charles Dickens wrote Great Expectations book"
\end{itemize}

\item<3-> Extracted occurrence:\\
\scriptsize{\tuple{The famous writer, Charles Dickens, wrote, Great Expectations, book, true, www.books.com/TopRated}}
\end{itemize}

\vfill

\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}

\end{frame}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}


\begin{enumerate}
\setcounter{enumi}{2}
\item $\mathcal{P}$ \la\ GenPatterns($\mathcal{O}$)
%\begin{itemize}
%\item[] $\mathcal{P}$ contains the patterns generated based on the occurrences
%\item[] Seek for low error rate patterns, ideally having high coverage
%\end{itemize}
\end{enumerate}

\begin{itemize}
\item \textbf{Patterns} are defined as 5-tuples \myemph{\tuple{prefix, middle, suffix, order, urlprefix}}
\begin{itemize}
\item \myemph{order} is \textbf{true} for \tuple{author, title} tuples, \textbf{false} for \tuple{title, author} tuples
\item the tuple matches the pattern if 
	\begin{itemize}
	\item there is a document in the collection (web) whose URL matches \myemph{urlprefix}, 
	\item that contains text matching the RegEx /.*prefix author middle title suffix.*/
	\end{itemize}
%\item more detailed RegEx are given for author and title
\end{itemize}

\item<2-> Group occurrences having similar \myemph{order} and \myemph{middle}\\
{\scriptsize \tuple{The famous writer, Charles Dickens, \myemphb{wrote}, Great Expectations, book, \myemphb{true}, www.books.com/TopRated}}\\
{\scriptsize \tuple{The great writer, Nicholas Sparks, \myemphb{wrote}, The Last Song, book, \myemphb{true}, www.books.com/BestSellers}}
\end{itemize}

\begin{itemize}
\item<3-> Generate a pattern  as general as possible:
\begin{center}
/\textbf{writer .*? wrote .*? book}/ with \textbf{order}=true and \textbf{urlprefix}=www.books.com
\end{center}
\end{itemize}

\vfill

\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}

\begin{enumerate}
\setcounter{enumi}{3}
\item $\mathcal{R}'$ \la\ $M_{\mathcal{D}}(\mathcal{P})$
\begin{itemize}
\item[] Update $\mathcal{R}'$ with the tuples from $\mathcal{D}$ matching patterns in $\mathcal{P}$
\end{itemize}
\end{enumerate}

\vfill

\begin{center}
\myemphb{/writer .*? wrote .*? book/} with \textbf{order}=true and \textbf{urlprefix}=www.books.com
\end{center}

\begin{itemize}
\item Result:  ......The writer \textbf{Mario Puzo} wrote \textbf{The Godfather} book.....

\vfill

\item[\Ra] Extract relation \myemph{\tuple{Mario Puzo, The Godfather}}

\end{itemize}

\vfill

\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}

Loïc Barrault's avatar
ie  
Loïc Barrault committed
671
672
\end{frame}

673
674
675
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}

\begin{enumerate}
\setcounter{enumi}{4}
\item Stop if $\mathcal{R}'$ is large enough, otherwise go to \circled{2}{black} $\mathcal{O}$ \la\ FindOccurrences($\mathcal{R}'$, $\mathcal{D}$)
\end{enumerate}

\vfill

\begin{itemize}
\item New tuple: \myemph{\tuple{Mario Puzo, The Godfather}}
\item New match: .... the book \textbf{The Godfather} was written by \textbf{Mario Puzo}....
\item New occurrence: \\
\tuple{the book, Mario Puzo, was written by, The Godfather, NULL, \myemphb{false}, www.library.com}
\item New pattern: \tuple{The book, was written by, NULL, false, www.library.com}
\begin{center}
	\myemphb{/the book .*? was written .*?/} with \textbf{order}=false and \textbf{urlprefix}=www.library.com
\end{center}
\end{itemize}

\vfill

\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}

\begin{itemize}
\item Experiment on 24 million web pages 
\item [\ra] \textbf{only} 5 seed tuples !
\end{itemize}

\begin{enumerate}
\item<2-> 1st iteration \ra\ 199 occurrences and 3 patterns \\
  \ra\  4047 unique \tuple{author, title} tuples
\item<3-> 2nd iteration over 5 million web pages \ra\ 3972 occurrences \ra\ 105 patterns \\
  \ra\ 9369 \tuple{author, title} tuples
\begin{itemize}
\item only manual action: some bad authors ("Conclusion") were rejected
\end{itemize}
\item<4-> Final iteration \ra\ 9988 occurrences \ra\ 346 patterns \\ 
\ra\ 15257 (almost) unique \tuple{author, title} tuples
\item[\Ra]<4-> list available here: \url{http://infolab.stanford.edu/~sergey/booklist.html}

\end{enumerate}

\begin{textblock*}{50mm}[0,0](90mm,5mm)
\tiny{
\begin{tabular}{ll}
\toprule
Isaac Asimov & The Robots of Dawn \\
David Brin & Startide Rising \\
James Gleick & Chaos: Making a New Science \\
Charles Dickens & Great Expectations \\
William Shakespeare & The Comedy of Errors \\
\bottomrule
\end{tabular}}
\end{textblock*}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping}

\vfill

\begin{itemize}
\item \textbf{Strengths}
\begin{itemize}
\item No need for manually annotated data
\item[\ra] only a few examples are required
\end{itemize}

\vfill

\item \textbf{Weaknesses}
\begin{itemize}
	\item Can suffer from \myemph{semantic drift}
	\begin{itemize}
 		\item[\ra] an erroneous pattern introduces erroneous tuples leading to erroneous patterns...
		\item a confidence score on patterns and tuples can mitigate this problem
	\end{itemize}
	\item Works well only when there is significant redundancy in expressing a relation
	\item Issues when multiple relations hold between the same pair of entities
	\begin{itemize}
		\item tuple \tuple{name, place of birth} could be mixed up with \tuple{name, place of death}
		\item a sentence containing an occurrence of one of those tuples could express either of the two relations
	\end{itemize}
\end{itemize}
\end{itemize}

\vfill

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Overview}

\gray{Relation Extraction}
\begin{itemize}
\item \gray{Task definition}
\end{itemize}

\textbf{Approaches to Relation Extraction}
\begin{itemize}
\item \gray{Knowledge-engineering approaches to RE}
\item \gray{Supervised learning approaches to RE}
\item \gray{Bootstrapping Approaches to RE}
\item \textbf{Distant Supervision Approaches to RE}
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}

\begin{block}{\textbf{Distant supervision} also referred to as \textbf{lightly supervised} machine learning}
\textbf{Aim}: reduce/eliminate the need for manually labelled data\\
\textbf{Principle}: use a structured data source $\mathcal{R}$ (e.g. a database) to label a large document collection $\mathcal{D}$ then train a standard supervised ML system
\end{block}
\vfill
\begin{enumerate}
\item Search for sentences in $\mathcal{D}$ containing the entity pairs that occur in relation instances (tuples) in $\mathcal{R}$
\item Label these sentences as positive occurrences of the relation
\item Use the labelled sentences as training data for a standard supervised relation extractor (ML algorithm)
\end{enumerate}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}

\begin{itemize}
\item Well-known approach: Mintz et al. (2009)
\item They use Freebase as structured data source
\end{itemize}

\begin{center}
\tiny{
\renewcommand{\arraystretch}{0.6}% Tighter
\begin{tabular}{lll}
\toprule
Relation name & Size & Example \\
\midrule
/people/person/nationality 				 	& 281,107	& John Dugard, South Africa			\\
/location/location/contains  				& 253,223	& Belgium, Nijlen					\\
/people/person/profession  				& 208,888	& Dusa McDuff, Mathematician			\\
/people/person/place of birth  				& 105,799	& Edwin Hubble, Marshfield			\\
/dining/restaurant/cuisine  					& 86,213	& MacAyo’s Mexican Kitchen, Mexican 	\\
/business/business chain/location  			& 66,529	& Apple Inc., Apple Inc., South Park, NC 	\\
/biology/organism classification rank  		& 42,806	& Scorpaeniformes, Order				\\
/film/film/genre 							& 40,658	& Where the Sidewalk Ends, Film noir 	\\
/film/film/language 						& 31,103	& Enter the Phoenix, Cantonese 		\\
/biology/organism higher classification  		& 30,052	& Calopteryx, Calopterygidae			\\
/film/film/country 						& 27,217	& Turtle Diary, United States			\\
/film/writer/film 							& 23,856	& Irving Shulman, Rebel Without a Cause \\
/film/director/film 						& 23,539	& Michael Mann, Collateral			\\	
/film/producer/film 						& 22,079	& Diane Eskenazi, Aladdin			\\	
/people/deceased person/place of death  		& 18,814	& John W. Kern, Asheville				\\	
/music/artist/origin  						& 18,619	& The Octopus Project, Austin			\\
/people/person/religion  					& 17,582	& Joseph Chartrand, Catholicism		\\
/book/author/works written  				& 17,278	& Paul Auster, Travels in the Scriptorium 	\\
/soccer/football position/players  			& 17,244	& Midfielder, Chen Tao				\\
/people/deceased person/cause of death  		& 16,709	& Richard Daintree, Tuberculosis		\\
/book/book/genre 						& 16,431	& Pony Soldiers, Science fiction 		\\
/film/film/music  						& 14,070	& Stavisky, Stephen Sondheim			\\
/business/company/industry 				& 13,805 	& ATS Medical, Health care			\\
\bottomrule
\end{tabular}}
\end{center}

\source{Mintz et al. (2009) \cite{Mintz2009}}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}

\begin{itemize}
\item \myemph{Freebase}: free online database of structured semantic data
\begin{itemize}
\item data from Wikipedia infoboxes + other open access sources
\item 102 relations connecting 940,000 entities \ra\ 1.8 million instances
\item Freebase no longer available: bought by Google \ra\ Google Knowledge Graph (partly free / partly paid access)
\item Similar sources: \textbf{DBPedia} and \textbf{WikiData}
\end{itemize}

\item Mintz et al. (2009) use a dump of Wikipedia as their \textbf{document collection}
\begin{itemize}
\item $\sim$ 1.8 million articles with 14.3 sentences/article on average
\item 800,000 articles for training / 400,000 for testing 
\end{itemize}

\end{itemize}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}

\begin{block}{Distant supervision }
\textbf{Assumption}: any sentence containing two entities that participate in a relation might express that relation \ra\ tag those sentences as mentions of the relation
\end{block}

\begin{itemize}
\item Same relation may be expressed in different ways in different sentences:
\item[] {\scriptsize \annot{Steven Spielberg}{PER} 's film \annot{Saving Private Ryan}{FILM} is loosely based on the brothers' story.}
\item[] {\scriptsize Allison co-produced the Academy Award-winning \annot{Saving Private Ryan}{FILM}, directed by \annot{Steven Spielberg}{PER}.}

\item Combine features from multiple mentions to get richer feature vector
\item Use multiclass logistic regression as a machine learning framework
\item At test time: combine all occurrences of a given entity pair, assign the most likely relation (or none)
\end{itemize}

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}

\vfill

\begin{itemize}
\item Need for \myemph{negative instances} \ra\ an `unrelated' relation
\begin{itemize}
\item Randomly select entity pairs not appearing in Freebase relations, extract features from them
\item Rare cases: the relation could be wrongly omitted from Freebase \ra\ low effect on performance
\end{itemize}

\vfill

\item Evaluation
\begin{itemize}
\item Human evaluation of the top 100 and top 1000 results per relation for 10 relations
\item Avg. precision for best feature combinations: 69\% for top 100 and 68\% for top 1000
\item Competitive results with knowledge engineering and supervised learning methods  
\end{itemize}
\end{itemize}

\vfill

\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}

\vfill
\begin{itemize}
\item \textbf{Strengths}
\begin{itemize}
\item Need for manually labelled training data is eliminated
\item[\ra] still need some expert linguistic resources (e.g. Freebase)
\item Can very rapidly get extractors for a wide range of relations
\end{itemize}
\vfill
\item \textbf{Weaknesses}
\begin{itemize}
\item Precision still a bit behind best knowledge-engineering/supervised ML approaches
\item Requires a structured database, as large as possible, covering the relations of interest
\end{itemize}
\end{itemize}
\vfill
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Conclusion}

\begin{block}{\myemph{Relation extraction}} 
aims to \textbf{detect} and \textbf{classify} all mentions of a \myemphb{given set of relations} holding between \myemphb{specified entities} within a given text
\end{block}


\begin{itemize}
\item Core IE technology, \textbf{very difficult} due to the \textbf{high variability} of relation expressions in natural language
\item Presented four different approaches:
\begin{itemize}
\item Knowledge engineering, supervised machine learning, bootstrapping and distant supervision
\end{itemize}
\item \textbf{Open challenges}
\begin{itemize}
\item improve precision and recall
\item handle relations expressed over several sentences
\item handle textual entailment
\item improve bootstrapping methods to avoid \myemph{semantic drift}
\item develop relation extractors for languages other than English (e.g. under-resourced languages)
\end{itemize}

\end{itemize}

\end{frame}
Loïc Barrault's avatar
ie  
Loïc Barrault committed
976
977
978
979
980





981
\nocite{Agichtein:2000,Jurafsky:2009}
Loïc Barrault's avatar
ie  
Loïc Barrault committed
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000