% Journal article record (OpenScience / ISTE). The citation key is the article DOI.
% Authors are stored in "Last, First" form and joined with " and ", the only
% name separator BibTeX recognises; "number" holds the bare issue number so the
% bibliography style can add its own "no."/"issue" label.
@article{10.21494/ISTE.OP.2020.0463,
  author   = {Maitre, Julien and Menard, Michel and Chiron, Guillaume and Bouju, Alain},
  title    = {Detection of weak signals in weakly structured data masses},
  journal  = {Information Retrieval, Document and Semantic Web},
  volume   = {3},
  number   = {1},
  year     = {2020},
  issn     = {2516-3280},
  doi      = {10.21494/ISTE.OP.2020.0463},
  url      = {http://www.openscience.fr/Detection-of-weak-signals-in-weakly-structured-data-masses},
  abstract = {This paper is related to a project aiming at discovering weak signals from different streams of information, possibly sent by whistleblowers in a platform as GlobalLeaks. The study presented in this paper tackles the particular problem of clustering topics at multi-levels from multiple documents, and then extracting meaningful descriptors, such as weighted lists of words for document representations in a multi-dimensions space. In this context, we present a novel idea which combines Latent Dirichlet Allocation and Word2Vec (providing a consistency metric regarding the partitioned topics) as potential method for limiting the “a priori” number of cluster k usually needed in classical partitioning approaches. We proposed 2 implementations of this idea, respectively able to : (1) finding the best k for LDA in terms of topic consistency ; (2) gathering the optimal clusters from different levels of clustering. We also proposed a non-traditional visualization approach based on a multi-agents system which combines both dimension reduction and interactivity.},
}