Commit 2adabb29 authored by Loïc Barrault's avatar Loïc Barrault
Browse files

Added IE + some macros

parent da245c5c
......@@ -11,6 +11,7 @@
\usepackage[T1]{fontenc}
\usepackage{bbding}
\usepackage{multirow}
\usepackage{bm}
\usepackage[absolute,showboxes,overlay]{textpos}
\textblockorigin{10mm}{10mm} % origine des positions
......@@ -84,6 +85,9 @@
% Shorthand for the abbreviation "e.g.".
\newcommand{\eg}{e.g.}
% Author note: prints its argument in red, wrapped in @@ markers.
\newcommand{\todo}[1]{{\color{red} @@ #1 @@}}
%\DeclareMathOperator*{\argmax}{argmax}
% "argmax" operator name built with \operatornamewithlimits.
\newcommand{\argmaxx}{\operatornamewithlimits{argmax}}
% \argmax expands to \ds followed by \argmaxx.
% NOTE(review): \ds is not defined in this excerpt -- presumably a \displaystyle shorthand; confirm.
\newcommand{\argmax}{ \ds \argmaxx}
......@@ -100,5 +104,9 @@
% Font switch \DejaSans selecting the "DejaVu Sans" family.
\newfontfamily\DejaSans{DejaVu Sans}
% \annot{text}{label}: typesets [text] followed by label as a math-mode subscript.
\newcommand{\annot}[2]{[#1]$_{#2}$}
%\newcommand{\annot}[2]{#1\_#2}
% Directories searched when including graphics.
\graphicspath{{../}{../figures/}{./figures/}}
%{../figures/figures.pivot}{../figures/figures.dataselection}}
......@@ -31,12 +31,26 @@
% My highlight
%\newcommand{\myhl}[2]{\colorbox{#1}{\strut #2}} %usage \myhl{color}{text}
\newcommand{\myhl}[2]{%
\begingroup\setlength{\fboxsep}{1pt}%
%\newcommand<>\hlbox[2]{\only#3{\colorbox{#1}{#2}}}
\newcommand<>{\myhl}[2]{%
\alt#3{\begingroup\setlength{\fboxsep}{0pt}%
\colorbox{#1}{\vphantom{Ay}#2}%
\endgroup
\endgroup%
}{\begingroup\setlength{\fboxsep}{0pt}%
\colorbox{white}{\vphantom{Ay}#2}%
\endgroup}
}
% \colorbox{#1}{\hspace*{2pt}\vphantom{Ay}#2\hspace*{2pt}}%
%\newcommand<>{\myhlol}[2]{%
% \only#3{ \begingroup\setlength{\fboxsep}{1pt}%
% \colorbox{#1}{\vphantom{Ay}#2}%
% \endgroup
% }
%}
\setbeamercolor{alerted text}{fg=cyan}
......@@ -6,7 +6,8 @@
% Colour wrappers: each typesets its argument in the named colour; the inner
% group keeps the colour change local to the argument.
% NOTE(review): darkpastelgreen, edinblue and edinred are not defined in this
% excerpt -- presumably declared elsewhere with \definecolor; confirm.
\newcommand{\blue}[1]{{\color{blue} #1}}
\newcommand{\cyan}[1]{{\color{cyan} #1}}
\newcommand{\orange}[1]{{\color{orange} #1}}
\newcommand{\green}[1]{{\color{darkpastelgreen} #1}}
\newcommand{\gray}[1]{{\color{gray} #1}}
\newcommand{\edinblue}[1]{{\color{edinblue} #1}}
\newcommand{\edinred}[1]{{\color{edinred} #1}}
......@@ -23,7 +24,6 @@
\definecolor{liumlightgray}{rgb}{0.9,0.9,0.9}
\newcommand{\liumlightgray}[1]{{\color{liumlightgray} #1}}
\newcommand{\myemph}[1]{{\liumcyan{\bf #1}}}
\newcommand{\myemph}[1]{\liumcyan{\textbf{#1}}}
\newcommand{\green}[1]{{\color{darkpastelgreen} #1}}
\ No newline at end of file
% !TEX root = text_processing.tex
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
\vfill
\centering
\Huge{\edinred{[Information Extraction]\\Introduction}}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{}
\begin{itemize}
\item \textbf{Introduction to Information Extraction}
\begin{itemize}
\item \textbf{Definition + contrast with IR}
\item \textbf{Example Applications}
\item \textbf{Overview of Tasks}
\item \textbf{Overview of Approaches}
\item \textbf{Evaluation + Shared Task Challenges}
\item \textbf{A brief history of IE}
\end{itemize}
\item Named Entity Recognition
\begin{itemize}
\item Task
\item Approaches: Rule-based, Supervised Learning
\item Entity Linking
\end{itemize}
\item Relation Extraction
\begin{itemize}
\item Task
\item Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction}
\begin{block}{Definition}
\myemph{Information Extraction} is the task of \textbf{identifying information} about predefined classes of \textbf{entities}, \textbf{relationships} or \textbf{events} and recording it in a \textbf{structured form}
\end{block}
\vfill
\textbf{Other definitions:}
\begin{itemize}
\item The activity of populating a structured information repository from an unstructured information source
\item The activity of creating a semantically annotated text collection ~(\ra\ \myemph{semantic web})
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction}
Why do \myemph{information extraction}?
\begin{itemize}
\item searching or analysis using conventional database queries
\begin{itemize}
\item[\ra] Difficult to search directly in the text because it lacks structure.
\end{itemize}
\item data mining
\item summarisation (possibly in another language)
\item construct indexes into/within/between large quantities of text
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction}
\begin{itemize}
\item Entities: persons, organisations, locations, times, etc. \ra\ \myemph{Named Entities}
\item Relationships: links between entities, etc.
\item Events: succession events, etc.
\end{itemize}
\vfill
The \textbf{structured form} can be implemented either as a database or form (slot filling) or by using XML tags (tagging)
\vfill
\todo{ADD A GRAPHIC WITH OBAMA, events = president start, end... WIFE = michelle etc...}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Example}
{\centering
\fbox{
\parbox{.9\textwidth}{
Who’s News: @ \myhl<3->{cyan!40}{Burns Fry Ltd.}\\
\myhl<5->{orange}{04/13/94} WALL STREET JOURNAL (J), PAGE B10 \textlangle CO\textrangle\\
\myhl<3->{cyan!40}{BURNS FRY Ltd.} (\myhl<4->{green!30}{Toronto}) – \myhl<2->{brown!90}{Donald Wright}, 46 years old, \myhl<7->{blue!20}{was named} \myhl<6->{red!40}{executive vice president} and \myhl<6->{red!40}{director of fixed income} at this brokerage firm. \myhl<2->{brown!90}{Mr. Wright} \myhl<7->{blue!20}{resigned} as \myhl<6->{red!40}{president} of \myhl<3->{cyan!40}{Merrill Lynch Canada Inc.}, a unit of \myhl<3->{cyan!40}{Merrill Lynch \& Co.}, to succeed \myhl<2->{brown!90}{Mark Kassirer}, 48, who left \myhl<3->{cyan!40}{Burns Fry} \myhl<5->{orange}{last month}. A \myhl<3->{cyan!40}{Merrill Lynch} \myhl<6->{red!40}{spokeswoman} said it hasn’t named a successor to \myhl<2->{brown!90}{Mr. Wright}, who is expected to begin his new position \myhl<5->{orange}{by the end of the month}.
}}
}
\begin{columns}
\begin{column}{.5\textwidth}
\begin{itemize}
\item \myhl<2->{brown!90}{persons}
\item \myhl<3->{cyan!40}{organisations}
\item \myhl<4->{green!30}{locations}
\end{itemize}
\end{column}
\begin{column}{.5\textwidth}
\begin{itemize}
\item \myhl<5->{orange}{times}
\item \myhl<6->{red!40}{position in a company}
\item \myhl<7->{blue!20}{succession events}
\end{itemize}
\end{column}
\end{columns}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Filled template}
\begin{center}
\includegraphics[width=0.8\textwidth]{ie_template}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction vs. Information Retrieval}
\textbf{IR Task:}
\begin{itemize}
\item Given a document collection and a user query
\item Returns a (ranked) list of documents relevant to the user query
\end{itemize}
\textbf{Strengths:}
\begin{itemize}
\item Can search huge document collections very rapidly
\item Insensitive to genre and domain of the texts
\item Relatively straightforward to implement
\begin{itemize}
\item challenges scaling to huge, dynamic document collections, e.g. the web
\end{itemize}
\end{itemize}
\textbf{Weaknesses:}
\begin{itemize}
\item Documents are returned rather than information/answers
\begin{itemize}
\item user must further read texts to extract information
\item output is unstructured so limited possibilities for further processing
\end{itemize}
\item Frequently not discriminating enough (“14,100,000 documents match your request”)
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction vs. Information Retrieval}
\textbf{IE Task:}
\begin{itemize}
\item Given a document collection and a predefined set of entities, relations and/or events
\item Returns a structured representation of all mentions of the specified entities, relations and/or events
\end{itemize}
\textbf{Strengths:}
\begin{itemize}
\item Extracts \textbf{facts} from texts, not just texts from text collections
\item Can feed other powerful applications (databases, semantic indexing engines, data mining tools)
\end{itemize}
\textbf{Weaknesses:}
\begin{itemize}
\item Systems tend to be genre/domain specific and porting to new genres and domains can be time-consuming/requires expertise
\item Limited accuracy
\item Computationally demanding, so performance issues on very large collections
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: example applications}
\begin{itemize}
\item Google News uses Named Entity Recognition for its “In the News” feature
\item Scraping web pages to build structured databases of job postings, apartment rentals, seminar announcements, etc.
\item Assisting biomedical database curators by extracting biomedical entities and relations from the scientific literature prior to entry in a human-maintained database (e.g. Flybase)
\item Assisting companies in competitor intelligence gathering, e.g. management or researcher succession events, new product or project announcements, etc.
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview}
\begin{itemize}
\item \textbf{Introduction to Information Extraction}
\begin{itemize}
\item \gray{Definition + contrast with IR}
\item \gray{Example Applications}
\item \textbf{Overview of Tasks}
\item Overview of Approaches
\item Evaluation + Shared Task Challenges
\item A brief history of IE
\end{itemize}
\item Named Entity Recognition
\begin{itemize}
\item Task
\item Approaches: Rule-based, Supervised Learning
\item Entity Linking
\end{itemize}
\item Relation Extraction
\begin{itemize}
\item Task
\item Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview of tasks}
\begin{block}{\textbf{Entity Extraction/Named Entity Recognition (NER)}}
Task: Identify the \myemph{extent} and the \myemph{type} of each textual mention of an entity\\
The set of types is determined in advance (e.g. organisation, person, date, etc.)
\end{block}
\begin{center}
\begin{tabular}{ll}
\myhl{cyan!40}{Cable and Wireless} today announced \ldots & Extent: 0-3 ; Type = \myhl{cyan!40}{ORG} \\
\myhl{cyan!40}{IBM} and \myhl{cyan!40}{Microsoft} today announced \ldots & Extent: 0-1 ; Type = \myhl{cyan!40}{ORG} \\
& Extent: 2-3 ; Type = \myhl{cyan!40}{ORG} \\
\myhl{brown!90}{John Lewis} hired \ldots & Extent: 0-2 ; Type = \myhl{cyan!40}{ORG} \\
\myhl{brown!90}{Theresa May} hired. & Extent: 0-2 ; Type = \myhl{brown!90}{PER}
\end{tabular}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Entity extraction}
\textbf{Types of entities addressed by IE systems include:}\\
\begin{itemize}
\item \textbf{Named individuals}
\begin{itemize}
\item Organisations (ORG), persons (PER), books, films, ships, restaurants . . .
\item[\ra] \myhl{cyan!40}{Cable and Wireless} today announced \ldots ; Extent: \textbf{0-3} ; Type = \textbf{ORG} \\
\item[\ra] \myhl{brown!90}{Barack Obama} was the 44th president \ldots ; Extent: \textbf{0-2} ; Type = \textbf{PER} \\
\item Geo-Political entities (GPE), locations (LOC)
\item[\ra] The \myhl{carminered}{Mont Blanc} intersects France, Italy and Switzerland. ; Extent: \textbf{1-3} ; Type = \textbf{LOC} \\
\item[\ra] The Mont Blanc intersects \myhl{carminered!60}{France}, \myhl{carminered!60}{Italy} and \myhl{carminered!60}{Switzerland}. ; Extent: \textbf{4-5} ; Type = \textbf{GPE} \\
\end{itemize}
%\item Named kinds
%\begin{itemize}
%\item Proteins, chemical compounds/drugs, diseases, aircraft components . . .
%\end{itemize}
\item \textbf{Times}: temporal expressions: dates, times of day
\begin{itemize}
\item[\ra] Let's meet at \myhl{orange}{2pm} next Friday \ldots ; Extent: \textbf{3-4} ; Type = \textbf{TIME} \\
\item[\ra] Let's meet at 2pm next \myhl{orange!50}{Friday} \ldots ; Extent: \textbf{5-6} ; Type = \textbf{DATE} \\
\end{itemize}
\item \textbf{Measures}: monetary expressions, distances/sizes, weights . . .
\begin{itemize}
\item[\ra] This watch costs \myhl{bananayellow}{£35} \ldots ; Extent: \textbf{3-4} ; Type = \textbf{MONEY} \\
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Entity extraction: coreference}
\begin{block}{\textbf{Coreference}}
Different textual expressions that refer to the same real world entity are said to \myemph{corefer}.
\textbf{Coreference Task}: link together all textual references to the same \myemph{real world entity}.
\end{block}
Multiple references to the same entity in a text are rarely made using the same string:
\begin{itemize}
\item Pronouns: \textbf{Tony Blair} == \textbf{he}
\item Names/definite descriptions: \textbf{Tony Blair} == \textbf{the Prime Minister}
\item Abbreviated forms: \textbf{Theresa May} == \textbf{May}; \textbf{European Union} == \textbf{EU}
\item Orthographic variants: \textbf{alpha helix} == \textbf{alpha-helix} == \textbf{$\bm{\alpha}$-helix} == \textbf{a-helix}
\end{itemize}
\vfill
Can be seen as a separate task or as part of the entity extraction task
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation extraction}
\begin{block}{\textbf{Relation Extraction}}
Identify all assertions of relations between entities %usually binary
\end{block}
May be divided into two subtasks:
\begin{itemize}
\item \textbf{Relation detection}: find pairs of entities between which a relation holds
\item \textbf{Relation classification}: determine the type of a previously extracted relation
\end{itemize}
\vfill
\only<2>{
Example:
{\sc location\_of} holding between
\begin{itemize}
\item[] \begin{itemize}
\item {\sc organisation} and {\sc geopolitical\_location}
\item medical {\sc investigation} and {\sc body\_part}
\item {\sc gene} and {\sc chromosome\_location}
\end{itemize}
\item {\sc employee\_of} holding between {\sc person} and {\sc organisation}
\item {\sc product\_of} holding between {\sc artifact} and {\sc organisation}
%\item {\sc interaction} holding between {\sc protein} and {\sc protein}
\end{itemize}
}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Relation extraction}
\textbf{Challenges for Relation Extraction}
\begin{enumerate}
\item<1-> Many different ways to express the same relation
\begin{itemize}
\item \myemph{Canonical}: \annot{Microsoft}{ORG} \textbf{\underline{is located in}} \annot{Redmond}{LOC}
\item \myemph{Synonyms}: \annot{Microsoft}{ORG} \textbf{\underline{is located/based/headquartered in}} \annot{Redmond}{LOC}
\item \myemph{Syntactic variations}:
\begin{itemize}
\item \annot{Microsoft}{ORG}, the software giant and ... , \textbf{\underline{is based in}} \annot{Redmond}{LOC}
\item \annot{Redmond}{LOC}\textbf{\underline{-based}} \annot{Microsoft}{ORG} ...
\item \annot{Redmond}{LOC}\textbf{\underline{'s}} \annot{Microsoft}{ORG} ...
\item \annot{Redmond}{LOC} software giant \annot{Microsoft}{ORG} ...
\end{itemize}
\end{itemize}
\item<2-> Relations often involve coreference links\\
\small{
\myemph{\annot{Bill Gates}{PER}} \textbf{\underline{is co-founder, technology advisor and board member}} of \annot{Microsoft}{ORG}. \myemph{\annot{He}{PER}} \textbf{\underline{served as chairman of the board}} until Feb. 4, 2014. On June 27, 2008, \myemph{\annot{Gates}{PER}} \textbf{\underline{transitioned out}} of a day-to-day role in the company to spend more time on his global health and education work at the \annot{Bill \& Melinda Gates Foundation}{ORG}.
}
\end{enumerate}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Event extraction}
\begin{block}{\textbf{Event Extraction}}
Identify all reports of event instances, typically of a small set of classes
\end{block}
May be divided into two subtasks:
\begin{itemize}
\item \textbf{Event detection}: find all mentions of events in a text
\item \textbf{Event classification}: assign a class to the detected events
\end{itemize}
\vfill
Examples
\begin{itemize}
\item National/European elections
\item Management succession events
\item Joint venture/product announcements
\item Terrorist attacks
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Event extraction}
\textbf{Challenges for Event Extraction}\\
Events may be simply viewed as relations. However, they are typically complex relations\\
\vfill
\begin{itemize}
\item<1-> often temporally situated, often short duration
\item[\ra]<1-> \small{Bolt etched his name in history with a 9.69-second finish \textbf{on Aug. 16, 2008}.}\\
\item[]<2->
\item<2-> often involve multiple role players (often >2)
\item[\ra]<2-> \small{23 September 2019. \textbf{\underline{Banks, businesses, civil society and governments}} at all levels are to announce initiatives to finance and build a new generation of sustainable cities at the \textbf{UN Climate Action Summit} in New York today.}\\
\item[]<3->
\item<3-> often expressed across multiple sentences
\end{itemize}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview}
\begin{itemize}
\item \textbf{Introduction to Information Extraction}
\begin{itemize}
\item \gray{Definition + contrast with IR}
\item \gray{Example Applications}
\item \gray{Overview of Tasks}
\item \textbf{Overview of Approaches}
\item Evaluation + Shared Task Challenges
\item A brief history of IE
\end{itemize}
\item Named Entity Recognition
\begin{itemize}
\item Task
\item Approaches: Rule-based, Supervised Learning
\item Entity Linking
\end{itemize}
\item Relation Extraction
\begin{itemize}
\item Task
\item Approaches: Rule-based, Supervised Learning, Bootstrapping, Distant Supervision
\end{itemize}
\end{itemize}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Overview of Approaches}
\vfill
Approaches to IE may be placed into four categories:
\begin{enumerate}
\item Knowledge Engineering Approaches
\item Supervised Learning Approaches
\item Bootstrapping Approaches
\item Distant Supervision Approaches
\end{enumerate}
\vfill
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Knowledge Engineering Approaches}
\colorbox{liumlightgray}{
\parbox{.99\textwidth}{
\myhl{brown!90}{\annot{Mr. \tikzmark{a} Wright}{PER}}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \myhl{cyan!40}{\annot{Merrill Lynch \tikzmark{b} Canada Inc.}{ORG}}\\
\begin{tikzpicture}[overlay,remember picture]
\draw [very thick, color=carminered] ($({pic cs:a})+(0ex,-1ex)$) -- ($({pic cs:a})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:a})+(0ex,-3ex)$) -- ($({pic cs:b})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:b})+(0ex,-3ex)$) -- ($({pic cs:b})+(0ex,-1ex)$);
\end{tikzpicture}
}}
\vfill
Use manually authored rules and can be divided into
\begin{itemize}
\item “deep” – linguistically inspired language understanding systems
\item “shallow” – systems engineered to the IE task, typically using pattern-action rules
\end{itemize}
\begin{center}
\begin{tabular}{ll}
Pattern: &\textbf{``Mr. \$Uppercase-initial-word''} \\
Action: & \textbf{add-entity(person(Mr. \$Uppercase-initial-word))} \\
& \\
Pattern: & \textbf{``\$Person, \$Position of \$Organization''} \\
Action: & \textbf{add-relation(is-employed-by(\$Person,\$Organization))} \\
\end{tabular}
\end{center}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\frametitle{Information Extraction: Supervised Learning Approaches}
\Ra\ Machine Learning systems trained on manually annotated texts (entities and relations)\\
\colorbox{liumlightgray}{
\parbox{.99\textwidth}{
\myhl{brown!90}{\annot{Mr. \tikzmark{a2} Wright}{PER}}, \textbf{\underline{\annot{executive vice president}{POSITION}}} of \myhl{cyan!40}{\annot{Merrill Lynch \tikzmark{b2} Canada Inc.}{ORG}}\\
\begin{tikzpicture}[overlay,remember picture]
\draw [very thick, color=carminered] ($({pic cs:a2})+(0ex,-1ex)$) -- ($({pic cs:a2})+(0ex,-3ex)$);
\draw [very thick, color=carminered] ($({pic cs:a2})+(0ex,-3ex)$) -- ($({pic cs:b2})+(0ex,-3ex)$) node [midway, below, color=carminered] {is-employed-by};
\draw [very thick, color=carminered] ($({pic cs:b2})+(0ex,-3ex)$) -- ($({pic cs:b2})+(0ex,-1ex)$);
\end{tikzpicture}
}}
\textbf{For each entity/relation create a training instance}
\begin{itemize}
\item $k$ words either side of an entity mention
\item $k$ words to the left of entity 1 and to the right of entity 2 plus the words in between
\item[\ra] extract \myemph{features}: words, POS, morphology
\end{itemize}
\textbf{Systems may learn}
\begin{itemize}