% Alderduccio, Bordoni and Loreto (2004): data-compression IR method applied to the
% monolingual GIRT task; Lecture Notes in Computer Science vol. 3237, pp. 391--400.
% The volume editor and the publisher are stored in separate fields, and the series
% name is stored in normal casing so that the bibliography style controls presentation.
@article{b,
  title     = {A data compression approach to monolingual {GIRT} task: an agnostic point of view},
  author    = {Alderduccio, Daniela and Bordoni, Luciana and Loreto, Vittorio},
  url       = {http://www.springerlink.com/content/rclr22d4636dgpb0/},
  year      = {2004},
  date      = {2004-01-01},
  journal   = {Lecture Notes in Computer Science},
  volume    = {3237},
  pages     = {391--400},
  editor    = {Peters, C.},
  publisher = {Springer-Verlag},
  abstract  = {In this paper we apply a data-compression IR method in the GIRT social science database, focusing on the monolingual task in German and English. For this purpose we use a recently proposed general scheme for context recognition and context classification of strings of characters (in particular texts) or other coded information. The key point of the method is the computation of a suitable measure of remoteness (or similarity) between two strings of characters. This measure of remoteness reflects the distance between the structures present in the two strings, i.e. between the two different distributions of elements of the compared sequences. The hypothesis is that the information-theory oriented measure of remoteness between two sequences could reflect their semantic distance. It is worth stressing the generality and versatility of our information-theoretic method which applies to any kind of corpora of character strings, whatever the type of coding used (i.e. language).},
  keywords  = {complexity, data_compression, information_theory, loreto, zippers},
  pubstate  = {published},
  tppubtype = {article}
}
In this paper we apply a data-compression IR method in the GIRT social science database, focusing on the monolingual task in German and English. For this purpose we use a recently proposed general scheme for context recognition and context classification of strings of characters (in particular texts) or other coded information. The key point of the method is the computation of a suitable measure of remoteness (or similarity) between two strings of characters. This measure of remoteness reflects the distance between the structures present in the two strings, i.e. between the two different distributions of elements of the compared sequences. The hypothesis is that the information-theory oriented measure of remoteness between two sequences could reflect their semantic distance. It is worth stressing the generality and versatility of our information-theoretic method which applies to any kind of corpora of character strings, whatever the type of coding used (i.e. language).
% Benedetto, Caglioti and Loreto (2002): "Language trees and zipping",
% Physical Review Letters 88, 048702.
% Author names use the "Last, First" form; the list must not end with a comma,
% otherwise the final name is parsed as a surname with an empty given name.
@article{bcl2002,
  title     = {Language trees and zipping},
  author    = {Benedetto, Dario and Caglioti, Emanuele and Loreto, Vittorio},
  url       = {http://samarcanda.phys.uniroma1.it/vittorioloreto/PAPERS/2002/Benedetto_PhysRevLett_2002.pdf},
  year      = {2002},
  date      = {2002-01-01},
  journal   = {Physical Review Letters},
  volume    = {88},
  pages     = {048702},
  abstract  = {In this Letter we present a very general method for extracting information from a generic string of
characters, e.g., a text, a DNA sequence, or a time series. Based on data-compression techniques, its
key point is the computation of a suitable measure of the remoteness of two bodies of knowledge. We
present the implementation of the method to linguistic motivated problems, featuring highly accurate
results for language recognition, authorship attribution, and language classification.},
  keywords  = {complexity, data_compression, information_theory, loreto, zippers},
  pubstate  = {published},
  tppubtype = {article}
}
In this Letter we present a very general method for extracting information from a generic string of
characters, e.g., a text, a DNA sequence, or a time series. Based on data-compression techniques, its
key point is the computation of a suitable measure of the remoteness of two bodies of knowledge. We
present the implementation of the method to linguistic motivated problems, featuring highly accurate
results for language recognition, authorship attribution, and language classification.