% Journal article (2019) by Abiven and Lejeune on the mazarinades corpus.
% Names are stored in "Last, First" form and joined with " and ";
% the accented given name uses a brace-wrapped LaTeX escape for portability.
@article{10.21494/ISTE.OP.2019.0335,
  author   = {Abiven, Karine and Lejeune, Ga{\"e}l},
  title    = {Automatic analysis of old documents: taking advantage of an incomplete, heterogeneous and noisy corpus},
  journal  = {Information Retrieval, Document and Semantic Web},
  volume   = {2},
  number   = {1},
  year     = {2019},
  doi      = {10.21494/ISTE.OP.2019.0335},
  url      = {https://www.openscience.fr/Automatic-analysis-of-old-documents-taking-advantage-of-an-incomplete},
  issn     = {2516-3280},
  abstract = {In this article we try to tackle some problems arising with noisy and heterogeneous data in the domain of digital humanities. We investigate a corpus known as the mazarinades corpus which gathers around 5,500 documents in French from the 17th century. First of all, we show that this set of documents is not strictly speaking a corpus since its coverage has not been thoroughly defined. Then, we advocate that it is possible to get interesting results even in the case of such an incomplete, heterogeneous and noisy dataset by strictly limiting the amount of pre-treatments necessary for processing texts. Finally, we present some results on a case study on document dating where we aim to complete missing metadata in the mazarinades corpus. We exploit a method based on character strings analysis which is robust to noisy data and can even take advantage of this noise for improving the quality of the results.},
}