Commit 62929611 authored by Loïc Barrault's avatar Loïc Barrault
Browse files

end of Rel Extract + start of Deep learning

parent c6a85a50
......@@ -112,6 +112,8 @@
\newcommand{\annot}[2]{[#1]$_{#2}$}
%\newcommand{\annot}[2]{#1\_#2}
\newcommand{\tuple}[1]{\textlangle#1\textrangle}
\newcommand{\B}[1]{B$_{#1}$}
\newcommand{\I}[1]{I$_{#1}$}
......
......@@ -480,12 +480,12 @@ Approaches to IE may be placed into four categories:
\colorbox{liumlightgray}{
\parbox{.99\textwidth}{
\myhl{brown!90}{\annot{Mr. \tikzmark{a} Wright}{PER}}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \myhl{cyan!40}{\annot{Merrill Lynch \tikzmark{b} Canada Inc.}{ORG}}\\
\myhl{brown!90}{\annot{Mr. \tikzmark{ie_a} Wright}{PER}}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \myhl{cyan!40}{\annot{Merrill Lynch \tikzmark{ie_b} Canada Inc.}{ORG}}\\
\begin{tikzpicture}[overlay,remember picture]
\draw [very thick, color=carminered] ($({pic cs:a})+(0ex,-1ex)$) -- ($({pic cs:a})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:a})+(0ex,-3ex)$) -- ($({pic cs:b})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:b})+(0ex,-3ex)$) -- ($({pic cs:b})+(0ex,-1ex)$);
\draw [very thick, color=carminered] ($({pic cs:ie_a})+(0ex,-1ex)$) -- ($({pic cs:ie_a})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:ie_a})+(0ex,-3ex)$) -- ($({pic cs:ie_b})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:ie_b})+(0ex,-3ex)$) -- ($({pic cs:ie_b})+(0ex,-1ex)$);
\end{tikzpicture}
}}
......@@ -519,12 +519,12 @@ Action: & \textbf{add-relation(is-employed-by(\$Person,\$Organization))} \\
\colorbox{liumlightgray}{
\parbox{.99\textwidth}{
\myhl{brown!90}{\annot{Mr. \tikzmark{a2} Wright}{PER}}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \myhl{cyan!40}{\annot{Merrill Lynch \tikzmark{b2} Canada Inc.}{ORG}}\\
\myhl{brown!90}{\annot{Mr. \tikzmark{ie_a2} Wright}{PER}}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \myhl{cyan!40}{\annot{Merrill Lynch \tikzmark{ie_b2} Canada Inc.}{ORG}}\\
\begin{tikzpicture}[overlay,remember picture]
\draw [very thick, color=carminered] ($({pic cs:a2})+(0ex,-1ex)$) -- ($({pic cs:a2})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:a2})+(0ex,-3ex)$) -- ($({pic cs:b2})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:b2})+(0ex,-3ex)$) -- ($({pic cs:b2})+(0ex,-1ex)$);
\draw [very thick, color=carminered] ($({pic cs:ie_a2})+(0ex,-1ex)$) -- ($({pic cs:ie_a2})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:ie_a2})+(0ex,-3ex)$) -- ($({pic cs:ie_b2})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:ie_b2})+(0ex,-3ex)$) -- ($({pic cs:ie_b2})+(0ex,-1ex)$);
\end{tikzpicture}
}}
......
......@@ -46,10 +46,10 @@
\textbf{Relation Extraction}
\begin{itemize}
\item Task definition
\item \textbf{Task definition}
\end{itemize}
\textbf{Approaches to Relation Extraction}
Approaches to Relation Extraction
\begin{itemize}
\item Knowledge-engineering approaches to RE
\item Supervised learning approaches to RE
......@@ -185,12 +185,12 @@ Task: Identify all \myemph{assertions of relations} holding between \myemph{enti
\colorbox{liumlightgray}{
\parbox{.99\textwidth}{
\annot{Mr. \tikzmark{a} Wright}{PER}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \annot{Merrill Lynch \tikzmark{b} Canada Inc.}{ORG}\\
\annot{Mr. \tikzmark{re_a} Wright}{PER}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \annot{Merrill Lynch \tikzmark{re_b} Canada Inc.}{ORG}\\
\begin{tikzpicture}[overlay,remember picture]
\draw [very thick, color=carminered] ($({pic cs:a})+(0ex,-1ex)$) -- ($({pic cs:a})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:a})+(0ex,-3ex)$) -- ($({pic cs:b})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:b})+(0ex,-3ex)$) -- ($({pic cs:b})+(0ex,-1ex)$);
\draw [very thick, color=carminered] ($({pic cs:re_a})+(0ex,-1ex)$) -- ($({pic cs:re_a})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:re_a})+(0ex,-3ex)$) -- ($({pic cs:re_b})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:re_b})+(0ex,-3ex)$) -- ($({pic cs:re_b})+(0ex,-1ex)$);
\end{tikzpicture}
}}
......@@ -251,13 +251,13 @@ Action: & \textbf{add-relation(is-employed-by(\$Person,\$Organization))} \\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: RE: Knowledge-engineering}
\vfill
\textbf{Strengths}
\begin{itemize}
\item High precision
\item Interpretable results \ra\ system behaviour is human-comprehensible
\end{itemize}
\vfill
\textbf{Weaknesses}
\begin{itemize}
\item The writing of rules has no end
......@@ -267,7 +267,7 @@ Action: & \textbf{add-relation(is-employed-by(\$Person,\$Organization))} \\
\item transduction rules for deep approaches
\end{itemize}
\end{itemize}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -357,7 +357,10 @@ Key issue: what \textbf{features} to use to represent instances?
\begin{frame}
\frametitle{Information Extraction: RE: Supervised Learning: Example}
Ex.: \annot{American Airlines}{ORG}, a unit of \annot{AMR Inc.}{ORG}, immediately matched the move, spokesman \annot{Tim Wagner}{PER} said. \source{Jurafsky and Martin, 2nd ed., p. 730}
{\scriptsize \annot{American Airlines}{ORG}, a unit of \annot{AMR Inc.}{ORG}, immediately matched the move, spokesman \annot{Tim Wagner}{PER} said. }\\
\source{Jurafsky and Martin, 2nd ed., p. 730}
\vspace{.2cm}
Features extracted from this example:
\scriptsize{
......@@ -394,6 +397,8 @@ Features extracted from this example:
\begin{frame}
\frametitle{Information Extraction: RE: Supervised Learning}
\vfill
\textbf{Strengths}
\begin{itemize}
\item No need to write extensive/complex rule sets for each domain
......@@ -401,6 +406,8 @@ Features extracted from this example:
\item[\ra] beware of data sparsity!
\end{itemize}
\vfill
\textbf{Weaknesses}
\begin{itemize}
\item Quality of relation extraction dependent on quality and quantity of training data
......@@ -408,6 +415,8 @@ Features extracted from this example:
\item Feature extractors can be noisy (e.g. parsers) \ra\ reduce overall performance
\end{itemize}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -468,12 +477,12 @@ Features extracted from this example:
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}
\myemphb{DIPRE} Dual Iterative Pattern Relation Expansion – proposed by Sergie Brin (1999)
\myemphb{DIPRE} Dual Iterative Pattern Relation Expansion – proposed by Sergey Brin (1999) \cite{Brin:1998}
\vfill
\textbf{Aim}: to extract useful relational tuples from the Web, of the form (\textsc{person}, \textsc{book\_title})
Ex.: (\textsc{Leo Tolstoy}, \textsc{War and Peace})
\textbf{Aim}: to extract useful relational tuples from the Web, of the form \tuple{\textsc{person}, \textsc{book\_title}}
Ex.: \tuple{\textsc{Leo Tolstoy}, \textsc{War and Peace}}
\vfill
......@@ -496,10 +505,9 @@ Ex.: (\textsc{Leo Tolstoy}, \textsc{War and Peace})
Main loop in DIPRE is the following:
\begin{enumerate}
\item $\mathcal{R}'$ \la\ Sample
\item $\mathcal{R}'$ \la\ Seed tuples
\begin{itemize}
\item[] $\mathcal{R}'$ is a set of tuples \ra\ approximation of target relation
\item[] Sample is the seed tuples (e.g. 5 author-title pairs)
\item[] $\mathcal{R}'$ contains sample instances of the target relation
\end{itemize}
\item $\mathcal{O}$ \la\ FindOccurrences($\mathcal{R}'$, $\mathcal{D}$)
......@@ -522,24 +530,454 @@ Main loop in DIPRE is the following:
\end{enumerate}
\begin{textblock*}{50mm}[0,0](103mm,7mm)
\tiny{
\begin{tabular}{ll}
\toprule
Isaac Asimov & The Robots of Dawn \\
Charles Dickens & Great Expectations \\
Leo Tolstoy & War and Peace \\
Jane Austen & Pride and Prejudice \\
Marcel Proust & In Search of Lost Time \\
\bottomrule
\end{tabular}}
\end{textblock*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}
\begin{enumerate}
\setcounter{enumi}{1}
\item $\mathcal{O}$ \la\ FindOccurrences($\mathcal{R}'$, $\mathcal{D}$)
\begin{itemize}
\item[] $\mathcal{O}$ contains all occurrences of tuples in $\mathcal{R}'$ appearing in $\mathcal{D}$
\end{itemize}
\end{enumerate}
\begin{itemize}
\item \textbf{Occurrences} are defined as 7-tuples \myemph{\tuple{author, title, order, url, prefix, middle, suffix}}
\begin{itemize}
\item \myemph{order} is \textbf{true} for \tuple{author, title} tuples, \textbf{false} for \tuple{title, author} tuples
\item \myemph{url} is the URL of the document containing the occurrence
\item \myemph{prefix} is the $m$ characters (in tests $m=10$) preceding the author (or title)
\item \myemph{middle} is the text between author and title
\item \myemph{suffix} is the $m$ characters following the title (or author)
\end{itemize}
\item<2-> Ex. search for \myemphb{\tuple{Charles Dickens, Great Expectations}} in the domain \textbf{www.books.com}
\begin{itemize}
\item[\ra] URL: \textbf{www.books.com/TopRated}
\item[\ra] text: ``The famous writer Charles Dickens wrote Great Expectations book''
\end{itemize}
\item<3-> Extracted occurrence:\\
\scriptsize{\tuple{The famous writer, Charles Dickens, wrote, Great Expectations, book, true, www.books.com/TopRated}}
\end{itemize}
\vfill
\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}
\begin{enumerate}
\setcounter{enumi}{2}
\item $\mathcal{P}$ \la\ GenPatterns($\mathcal{O}$)
%\begin{itemize}
%\item[] $\mathcal{P}$ contains the patterns generated based on the occurrences
%\item[] Seek for low error rate patterns, ideally having high coverage
%\end{itemize}
\end{enumerate}
\begin{itemize}
\item \textbf{Patterns} are defined as 5-tuples \myemph{\tuple{prefix, middle, suffix, order, urlprefix}}
\begin{itemize}
\item \myemph{order} is \textbf{true} for \tuple{author, title} tuples, \textbf{false} for \tuple{title, author} tuples
\item the tuple matches the pattern if
\begin{itemize}
\item there is a document in the collection (web) whose URL matches \myemph{urlprefix},
\item that contains text matching the RegEx /.*prefix author middle title suffix.*/
\end{itemize}
%\item more detailed RegEx are given for author and title
\end{itemize}
\item<2-> Group occurrences having similar \myemph{order} and \myemph{middle}\\
{\scriptsize \tuple{The famous writer, Charles Dickens, \myemphb{wrote}, Great Expectations, book, \myemphb{true}, www.books.com/TopRated}}\\
{\scriptsize \tuple{The great writer, Nicholas Sparks, \myemphb{wrote}, The Last Song, book, \myemphb{true}, www.books.com/BestSellers}}
\end{itemize}
\begin{itemize}
\item<3-> Generate a pattern as general as possible:
\begin{center}
/\textbf{writer .*? wrote .*? book}/ with \textbf{order}=true and \textbf{urlprefix}=www.books.com
\end{center}
\end{itemize}
\vfill
\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}
\begin{enumerate}
\setcounter{enumi}{3}
\item $\mathcal{R}'$ \la\ $M_{\mathcal{D}}(\mathcal{P})$
\begin{itemize}
\item[] Update $\mathcal{R}'$ with the tuples from $\mathcal{D}$ matching patterns in $\mathcal{P}$
\end{itemize}
\end{enumerate}
\vfill
\begin{center}
\myemphb{/writer .*? wrote .*? book/} with \textbf{order}=true and \textbf{urlprefix}=www.books.com
\end{center}
\begin{itemize}
\item Result: ......The writer \textbf{Mario Puzo} wrote \textbf{The Godfather} book.....
\vfill
\item[\Ra] Extract relation \myemph{\tuple{Mario Puzo, The Godfather}}
\end{itemize}
\vfill
\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}
\begin{enumerate}
\setcounter{enumi}{4}
\item Stop if $\mathcal{R}'$ is large enough, otherwise go to \circled{2}{black} $\mathcal{O}$ \la\ FindOccurrences($\mathcal{R}'$, $\mathcal{D}$)
\end{enumerate}
\vfill
\begin{itemize}
\item New tuple: \myemph{\tuple{Mario Puzo, The Godfather}}
\item New match: .... the book \textbf{The Godfather} was written by \textbf{Mario Puzo}....
\item New occurrence: \\
\tuple{the book, Mario Puzo, was written by, The Godfather, NULL, \myemphb{false}, www.library.com}
\item New pattern: \tuple{The book, was written by, NULL, false, www.library.com}
\begin{center}
\myemphb{/the book .*? was written by .*?/} with \textbf{order}=false and \textbf{urlprefix}=www.library.com
\end{center}
\end{itemize}
\vfill
\begin{center}
\source{\url{https://cs.uwaterloo.ca/~kmsalem/courses/CS848W10/presentations/Anup-proj.pdf}}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping - DIPRE}
\begin{itemize}
\item Experiment on 24 million web pages
\item [\ra] \textbf{only} 5 seed tuples!
\end{itemize}
\begin{enumerate}
\item<2-> 1st iteration \ra\ 199 occurrences and 3 patterns \\
\ra\ 4047 unique \tuple{author, title} tuples
\item<3-> 2nd iteration over 5 million web pages \ra\ 3972 occurrences \ra\ 105 patterns \\
\ra\ 9369 \tuple{author, title} tuples
\begin{itemize}
\item only manual action: some bad authors (``Conclusion'') were rejected
\end{itemize}
\item<4-> Final iteration \ra\ 9988 occurrences \ra\ 346 patterns \\
\ra\ 15257 (almost) unique \tuple{author, title} tuples
\item[\Ra]<4-> list available here: \url{http://infolab.stanford.edu/~sergey/booklist.html}
\end{enumerate}
\begin{textblock*}{50mm}[0,0](90mm,5mm)
\tiny{
\begin{tabular}{ll}
\toprule
Isaac Asimov & The Robots of Dawn \\
David Brin & Startide Rising \\
James Gleick & Chaos: Making a New Science \\
Charles Dickens & Great Expectations \\
William Shakespeare & The Comedy of Errors \\
\bottomrule
\end{tabular}}
\end{textblock*}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Bootstrapping}
\vfill
\begin{itemize}
\item \textbf{Strengths}
\begin{itemize}
\item No need for manually annotated data
\item[\ra] only a few examples are required
\end{itemize}
\vfill
\item \textbf{Weaknesses}
\begin{itemize}
\item Can suffer from \myemph{semantic drift}
\begin{itemize}
\item[\ra] an erroneous pattern introduces erroneous tuples leading to erroneous patterns...
\item a confidence score on patterns and tuples can mitigate this problem
\end{itemize}
\item Works well only when there is significant redundancy in expressing a relation
\item Issues when multiple relations hold between the same pair of entities
\begin{itemize}
\item tuple \tuple{name, place of birth} could be mixed up with \tuple{name, place of death}
\item a sentence containing an occurrence of one of those tuples could express either of the two relations
\end{itemize}
\end{itemize}
\end{itemize}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Overview}
\gray{Relation Extraction}
\begin{itemize}
\item \gray{Task definition}
\end{itemize}
\textbf{Approaches to Relation Extraction}
\begin{itemize}
\item \gray{Knowledge-engineering approaches to RE}
\item \gray{Supervised learning approaches to RE}
\item \gray{Bootstrapping Approaches to RE}
\item \textbf{Distant Supervision Approaches to RE}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}
\begin{block}{\textbf{Distant supervision} also referred to as \textbf{lightly supervised} machine learning}
\textbf{Aim}: reduce/eliminate the need for manually labelled data\\
\textbf{Principle}: use a structured data source $\mathcal{R}$ (e.g. a database) to label a large document collection $\mathcal{D}$ then train a standard supervised ML system
\end{block}
\vfill
\begin{enumerate}
\item Search for sentences in $\mathcal{D}$ containing the entity pairs that occur in relation instances (tuples) in $\mathcal{R}$
\item Label these sentences as positive occurrences of the relation
\item Use the labelled sentences as training data for a standard supervised relation extractor (ML algorithm)
\end{enumerate}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}
\begin{itemize}
\item Well known approach: Mintz et al. (2009)
\item They use Freebase as structured data source
\end{itemize}
\begin{center}
\tiny{
\renewcommand{\arraystretch}{0.6}% Tighter
\begin{tabular}{lll}
\toprule
Relation name & Size & Example \\
\midrule
/people/person/nationality & 281,107 & John Dugard, South Africa \\
/location/location/contains & 253,223 & Belgium, Nijlen \\
/people/person/profession & 208,888 & Dusa McDuff, Mathematician \\
/people/person/place of birth & 105,799 & Edwin Hubble, Marshfield \\
/dining/restaurant/cuisine & 86,213 & MacAyo’s Mexican Kitchen, Mexican \\
/business/business chain/location & 66,529 & Apple Inc., Apple Inc., South Park, NC \\
/biology/organism classification rank & 42,806 & Scorpaeniformes, Order \\
/film/film/genre & 40,658 & Where the Sidewalk Ends, Film noir \\
/film/film/language & 31,103 & Enter the Phoenix, Cantonese \\
/biology/organism higher classification & 30,052 & Calopteryx, Calopterygidae \\
/film/film/country & 27,217 & Turtle Diary, United States \\
/film/writer/film & 23,856 & Irving Shulman, Rebel Without a Cause \\
/film/director/film & 23,539 & Michael Mann, Collateral \\
/film/producer/film & 22,079 & Diane Eskenazi, Aladdin \\
/people/deceased person/place of death & 18,814 & John W. Kern, Asheville \\
/music/artist/origin & 18,619 & The Octopus Project, Austin \\
/people/person/religion & 17,582 & Joseph Chartrand, Catholicism \\
/book/author/works written & 17,278 & Paul Auster, Travels in the Scriptorium \\
/soccer/football position/players & 17,244 & Midfielder, Chen Tao \\
/people/deceased person/cause of death & 16,709 & Richard Daintree, Tuberculosis \\
/book/book/genre & 16,431 & Pony Soldiers, Science fiction \\
/film/film/music & 14,070 & Stavisky, Stephen Sondheim \\
/business/company/industry & 13,805 & ATS Medical, Health care \\
\bottomrule
\end{tabular}}
\end{center}
\source{Mintz et al. (2009) \cite{Mintz2009}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}
\begin{itemize}
\item \myemph{Freebase}: free online database of structured semantic data
\begin{itemize}
\item data from Wikipedia infoboxes + other open access sources
\item 102 relations connecting 940,000 entities \ra\ 1.8 million instances
\item Freebase no longer available: bought by Google \ra\ Google Knowledge Graph (partly free / partly paid access)
\item Similar sources: \textbf{DBPedia} and \textbf{WikiData}
\end{itemize}
\item Mintz et al. (2009) use a dump of Wikipedia as their \textbf{document collection}
\begin{itemize}
\item $\sim$ 1.8 million articles with 14.3 sentences/article on average
\item 800,000 articles for training / 400,000 for testing
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}
\begin{block}{Distant supervision }
\textbf{Assumption}: any sentence containing two entities that participate in a relation might express that relation \ra\ tag those sentences as mentions of the relation
\end{block}
\begin{itemize}
\item Same relation may be expressed in different ways in different sentences:
\item[] {\scriptsize \annot{Steven Spielberg}{PER} 's film \annot{Saving Private Ryan}{FILM} is loosely based on the brothers' story.}
\item[] {\scriptsize Allison co-produced the Academy Award-winning \annot{Saving Private Ryan}{FILM}, directed by \annot{Steven Spielberg}{PER}.}
\item Combine features from multiple mentions to get richer feature vector
\item Use multiclass logistic regression as a machine learning framework
\item At test time: combine all occurrences of a given entity pair, assign the most likely relation (or none)
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}
\vfill
\begin{itemize}
\item Need for \myemph{negative instances} \ra\ an 'unrelated' relation
\begin{itemize}
\item Randomly select entity pairs not appearing in Freebase relations, extract features from them
\item Rare cases: the relation could be wrongly omitted from Freebase \ra\ low effect on performance
\end{itemize}
\vfill
\item Evaluation
\begin{itemize}
\item Human evaluation of the top 100 and top 1000 results per relation for 10 relations
\item Avg. precision for best feature combinations: 69\% for top 100 and 68\% for top 1000
\item Competitive results with knowledge engineering and supervised learning methods
\end{itemize}
\end{itemize}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Distant Supervision}
\vfill
\begin{itemize}
\item \textbf{Strengths}
\begin{itemize}
\item Need for manually labelled training data is eliminated
\item[\ra] still need some expert linguistic resources (e.g. Freebase)
\item Can very rapidly get extractors for a wide range of relations
\end{itemize}
\vfill
\item \textbf{Weaknesses}
\begin{itemize}
\item Precision still a bit behind best knowledge-engineering/supervised ML approaches
\item Requires a structured database, as large as possible, for the relations of interest.
\end{itemize}
\end{itemize}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation Extraction: Conclusion}
\begin{block}{\myemph{Relation extraction}}
aims to \textbf{detect} and \textbf{classify} all mentions of a \myemphb{given set of relations} holding between \myemphb{specified entities} within a given text
\end{block}
\begin{itemize}
\item Core IE technology, \textbf{very difficult} due to the \textbf{high variability} of relation expressions in natural language
\item Presented four different approaches:
\begin{itemize}
\item Knowledge engineering, supervised machine learning, bootstrapping and distant supervision
\end{itemize}
\item \textbf{Open challenges}
\begin{itemize}