abstract={Named entity recognition (NER) is the task to identify text spans that mention named entities, and to classify them into predefined categories such as person, location, organization etc. NER serves as the basis for a variety of natural language applications such as question answering, text summarization, and machine translation. Although early NER systems are successful in producing decent recognition accuracy, they often require much human effort in carefully designing rules or features. In recent years, deep learning, empowered by continuous real-valued vector representations and semantic composition through nonlinear processing, has been employed in NER systems, yielding state-of-the-art performance. In this paper, we provide a comprehensive review on existing deep learning techniques for NER. We first introduce NER resources, including tagged NER corpora and off-the-shelf NER tools. Then, we systematically categorize existing works based on a taxonomy along three axes: distributed representations for input, context encoder, and tag decoder. Next, we survey the most representative methods for recent applied techniques of deep learning in new NER problem settings and applications. Finally, we present readers with the challenges faced by NER systems and outline future directions in this area.},

archivePrefix={arXiv},

arxivId={1812.09449},

author={Li, Jing and Sun, Aixin and Han, Jianglei and Li, Chenliang},

eprint={1812.09449},

file={:Users/loicbarrault/Library/Application Support/Mendeley Desktop/Downloaded/Li et al. - 2018 - A Survey on Deep Learning for Named Entity Recognition.pdf:pdf},

mendeley-groups={NER},

month={dec},

title={{A Survey on Deep Learning for Named Entity Recognition}},

url={http://arxiv.org/abs/1812.09449},

year={2018}

}

@inproceedings{Vaswani2017,
  abstract      = {The dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.0 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature.},
  archivePrefix = {arXiv},
  arxivId       = {1706.03762},
  author        = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  booktitle     = {Advances in Neural Information Processing Systems},
  eprint        = {1706.03762},
  issn          = {10495258},
  mendeley-groups = {LanguageModelling},
  pages         = {5998--6008},
  title         = {Attention Is All You Need},
  volume        = {30},
  year          = {2017}
}

@article{Hochreiter1997,
  abstract = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it by introducing a novel, efficient, gradient-based method called long short-term memory (LSTM). Truncating the gradient where this does not do harm, LSTM can learn to bridge minimal time lags in excess of 1000 discrete-time steps by enforcing constant error flow through constant error carousels within special units. Multiplicative gate units learn to open and close access to the constant error flow. LSTM is local in space and time; its computational complexity per time step and weight is O(1). Our experiments with artificial data involve local, distributed, real-valued, and noisy pattern representations. In comparisons with real-time recurrent learning, back propagation through time, recurrent cascade correlation, Elman nets, and neural sequence chunking, LSTM leads to many more successful runs, and learns much faster. LSTM also solves complex, artificial long-time-lag tasks that have never been solved by previous recurrent network algorithms.},
  author   = {Hochreiter, Sepp and Schmidhuber, J{\"{u}}rgen},
  doi      = {10.1162/neco.1997.9.8.1735},
  issn     = {08997667},
  journal  = {Neural Computation},
  mendeley-groups = {ML},
  number   = {8},
  pages    = {1735--1780},
  pmid     = {9377276},
  title    = {Long {Short-Term} Memory},
  volume   = {9},
  year     = {1997}
}

@inproceedings{Cho2014,
  abstract  = {In this paper, we propose a novel neural network model called RNN Encoder- Decoder that consists of two recurrent neural networks (RNN). One RNN encodes a sequence of symbols into a fixedlength vector representation, and the other decodes the representation into another sequence of symbols. The encoder and decoder of the proposed model are jointly trained to maximize the conditional probability of a target sequence given a source sequence. The performance of a statistical machine translation system is empirically found to improve by using the conditional probabilities of phrase pairs computed by the RNN Encoder-Decoder as an additional feature in the existing log-linear model. Qualitatively, we show that the proposed model learns a semantically and syntactically meaningful representation of linguistic phrases.},
  archivePrefix = {arXiv},
  arxivId   = {1406.1078},
  author    = {Cho, Kyunghyun and {Van Merri{\"{e}}nboer}, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  doi       = {10.3115/v1/d14-1179},
  eprint    = {1406.1078},
  isbn      = {9781937284961},
  mendeley-groups = {NMT},
  pages     = {1724--1734},
  title     = {Learning Phrase Representations Using {RNN} Encoder-Decoder for Statistical Machine Translation},
  year      = {2014}
}

\myemph{Project} or represent the \textbf{text} into a \myemph{continuous space} and train an estimator operating in this space to compute the probability of the sentiment.

\end{block}

\begin{itemize}

\item Special recurrent cells addressing the problem of \textbf{vanishing gradient}

\begin{itemize}

\item LSTM: Long Short-Term Memory \cite{Hochreiter1997}

\item GRU: Gated Recurrent Unit \cite{Cho2014}

\end{itemize}

\vfill

\item Transformer models \ra\ Vaswani et al. 2017, Attention is All you Need \cite{Vaswani2017}

\item[\Ra] All subsequent BERT models, see e.g. \url{https://nlp.stanford.edu/seminar/details/jdevlin.pdf}