Veksler, Vladislav Daniel; Govostes, Ryan Z; Gray, Wayne D
Defining the dimensions of the human semantic space (Incollection)
Sloutsky, Vladimir; Love, Brad; McRae, Ken (Eds.): 30th Annual Meeting of the Cognitive Science Society, pp. 1282-1287, Cognitive Science Society, Austin, TX, 2008.
@incollection{vdv08csc.paper,
title = {Defining the dimensions of the human semantic space},
author = {Vladislav Daniel Veksler and Ryan Z. Govostes and Wayne D. Gray},
editor = {Sloutsky, Vladimir and Love, Brad and McRae, Ken},
year = {2008},
date = {2008-01-01},
booktitle = {30th Annual Meeting of the Cognitive Science Society},
pages = {1282-1287},
publisher = {Cognitive Science Society},
address = {Austin, TX},
abstract = {We describe VGEM, a technique for converting probability-based measures of semantic relatedness (e.g. Normalized Google Distance, Pointwise Mutual Information) into a vector-based form to allow these measures to evaluate relatedness of multi-word terms (documents, paragraphs). We use a genetic algorithm to derive a set of 300 dimensions to represent the human semantic space. With the resulting dimension sets, VGEM matches or outperforms the probability-based measure, while adding the multi-word term functionality. We test VGEM's performance on multi-word terms against Latent Semantic Analysis and find no significant difference between the two measures. We conclude that VGEM is more useful than probability-based measures because it affords better performance, and provides relatedness between multi-word terms; and that VGEM is more useful than other vector-based measures because it is more computationally feasible for large, dynamic corpora (e.g. WWW), and thus affords a larger, dynamic lexicon.},
keywords = {computational linguistics, Latent Semantic Analysis, LSA, Measures of Semantic Relatedness, multidimensional semantic space, natural language processing, NGD, Normalized Google Distance, semantic dimensions, vector generation, VGEM},
pubstate = {published},
tppubtype = {incollection}
}
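To make the vector-generation idea concrete, here is a minimal sketch of a VGEM-style pipeline as described in the abstract, assuming PMI as the underlying probability-based measure. The count dictionaries and the five-word dimension list are illustrative stand-ins, not the authors' implementation (the paper derives roughly 300 dimensions with a genetic algorithm): each word becomes a vector of its relatedness scores to the dimension words, multi-word terms sum their word vectors, and relatedness is the cosine between vectors.

```python
import math

# Illustrative dimension words; the paper evolves ~300 of these with a
# genetic algorithm rather than hand-picking them.
DIMENSIONS = ["science", "music", "money", "animal", "war"]

def pmi(a, b, word_freq, pair_freq, n_docs):
    """Pointwise Mutual Information from (hypothetical) document counts:
    PMI(a, b) = log2(P(a, b) / (P(a) * P(b))).
    """
    p_a = word_freq.get(a, 0) / n_docs
    p_b = word_freq.get(b, 0) / n_docs
    p_ab = pair_freq.get(frozenset((a, b)), 0) / n_docs
    if 0 in (p_a, p_b, p_ab):
        return 0.0  # no co-occurrence evidence: treat as unrelated
    return math.log2(p_ab / (p_a * p_b))

def word_vector(word, word_freq, pair_freq, n_docs):
    """A word's VGEM-style vector: its MSR score against each dimension."""
    return [pmi(word, d, word_freq, pair_freq, n_docs) for d in DIMENSIONS]

def text_vector(text, word_freq, pair_freq, n_docs):
    """Multi-word terms (paragraphs, documents) sum their word vectors."""
    vectors = [word_vector(w, word_freq, pair_freq, n_docs)
               for w in text.lower().split()]
    return [sum(component) for component in zip(*vectors)]

def relatedness(u, v):
    """Cosine similarity between two term vectors."""
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return dot / norm if norm else 0.0
```

Because each vector component needs only co-occurrence counts between one term and one dimension word, the counts can in principle be backed by search-engine queries, which is what makes this family of measures feasible for large, dynamic corpora such as the WWW.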
@inproceedings{lindsey07iccm,
title = {Be wary of what your computer reads: The effects of corpus selection on measuring semantic relatedness},
author = {Robert Lindsey and Vladislav Daniel Veksler and Alex Grintsvayg and Wayne D. Gray},
doi = {10.13140/2.1.2295.1206},
crossref = {conf:iccm07},
abstract = {Measures of Semantic Relatedness (MSRs) provide models of human semantic associations and, as such, have been applied to predict human text comprehension (Lemaire, Denhiere, Bellissens, & Jhean-Larose, 2006). In addition, MSRs form key components in more integrated cognitive modeling, such as models that perform information search on the World Wide Web (WWW) (Pirolli, 2005). However, the effectiveness of an MSR depends on the algorithm it uses as well as the text corpus on which it is trained. In this paper, we examine the impact of corpus selection on the performance of two popular MSRs, Pointwise Mutual Information and Normalised Google Distance. We tested these measures with corpora derived from the WWW, books, news articles, emails, web forums, and an encyclopedia. Results indicate that for the tested MSRs, the traditionally employed books and WWW-based corpora are less than optimal, and that a corpus based on New York Times news articles best predicts human behavior.},
keywords = {computational linguistics, corpus comparison, Measures of Semantic Relatedness, natural language processing, NGD, Normalised Google Distance, PMI, Pointwise Mutual Information, semantic similarity, training corpus},
pubstate = {published},
tppubtype = {inproceedings}
}
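As a reference point for the comparison above, NGD is defined purely over document frequencies, so "training" it on a different corpus just means counting over that corpus's documents. A minimal sketch, assuming simple document-frequency dictionaries (the count structures are hypothetical stand-ins; the formula is the standard published NGD definition):

```python
import math

def ngd(x, y, doc_freq, pair_freq, n_docs):
    """Normalised Google Distance over an arbitrary corpus:
    NGD(x, y) = (max(log f(x), log f(y)) - log f(x, y))
                / (log N - min(log f(x), log f(y)))
    where f(.) counts documents containing the term(s) and N is the total
    number of documents. Lower distance means more closely related terms.
    """
    fx, fy = doc_freq.get(x, 0), doc_freq.get(y, 0)
    fxy = pair_freq.get(frozenset((x, y)), 0)
    if 0 in (fx, fy, fxy):
        return float("inf")  # terms unseen (together) carry no evidence
    lx, ly, lxy = math.log(fx), math.log(fy), math.log(fxy)
    return (max(lx, ly) - lxy) / (math.log(n_docs) - min(lx, ly))
```

Swapping doc_freq, pair_freq, and n_docs for counts taken from books, news articles, emails, or web forums is all that "corpus selection" amounts to here, which is why the choice of corpus can shift an MSR's fit to human judgments as substantially as the paper reports.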