Lindsey, Robert; Stipicevic, Michael; Veksler, Vladislav Daniel; Gray, Wayne D
BLOSSOM: Best path length on a semantic self-organizing map (Incollection)
Sloutsky, Vladimir; Love, Brad; McRae, Ken (Ed.): 30th Annual Meeting of the Cognitive Science Society, pp. 481–486, Cognitive Science Society, Austin, TX, 2008.
@incollection{lindsey08csc,
title = {{BLOSSOM}: Best path length on a semantic self-organizing map},
author = {Lindsey, Robert and Stipicevic, Michael and Veksler, Vladislav Daniel and Gray, Wayne D.},
editor = {Sloutsky, Vladimir and Love, Brad and McRae, Ken},
year = {2008},
date = {2008-01-01},
booktitle = {30th Annual Meeting of the Cognitive Science Society},
pages = {481--486},
publisher = {Cognitive Science Society},
address = {Austin, TX},
abstract = {We describe Vector Generation from Explicitly-defined Multidimensional semantic Space (VGEM), a method for converting a measure of semantic relatedness (MSR) into vector form. We also describe Best path Length on a Semantic Self-Organizing Map (BLOSSOM), a semantic relatedness technique employing VGEM and a connectionist, nonlinear dimensionality reduction technique. The psychological validity of BLOSSOM is evaluated using test cases from a large free-association norms dataset; we find that BLOSSOM consistently shows improvement over VGEM. BLOSSOM matches the performance of its base-MSR using a 21 dimensional vector-space and shows promise to outperform its base-MSR with a more rigorous exploration of the parameter space. In addition, BLOSSOM provides benefits such as document relatedness, concept-path formation, intuitive visualizations, and unsupervised text clustering.},
keywords = {BLOSSOM, computational linguistics, Dijkstra's algorithm, Measures of Semantic Relatedness, natural language processing, nonlinear dimensionality reduction, Self-Organizing Maps, SOM traversal, VGEM},
pubstate = {published},
tppubtype = {incollection}
}
We describe Vector Generation from Explicitly-defined Multidimensional semantic Space (VGEM), a method for converting a measure of semantic relatedness (MSR) into vector form. We also describe Best path Length on a Semantic Self-Organizing Map (BLOSSOM), a semantic relatedness technique employing VGEM and a connectionist, nonlinear dimensionality reduction technique. The psychological validity of BLOSSOM is evaluated using test cases from a large free-association norms dataset; we find that BLOSSOM consistently shows improvement over VGEM. BLOSSOM matches the performance of its base-MSR using a 21 dimensional vector-space and shows promise to outperform its base-MSR with a more rigorous exploration of the parameter space. In addition, BLOSSOM provides benefits such as document relatedness, concept-path formation, intuitive visualizations, and unsupervised text clustering.
Veksler, Vladislav Daniel; Govostes, Ryan Z; Gray, Wayne D
Defining the dimensions of the human semantic space (Incollection)
Sloutsky, Vladimir; Love, Brad; McRae, Ken (Ed.): 30th Annual Meeting of the Cognitive Science Society, pp. 1282–1287, Cognitive Science Society, Austin, TX, 2008.
@incollection{vdv08csc.paper,
title = {Defining the dimensions of the human semantic space},
author = {Veksler, Vladislav Daniel and Govostes, Ryan Z. and Gray, Wayne D.},
editor = {Sloutsky, Vladimir and Love, Brad and McRae, Ken},
year = {2008},
date = {2008-01-01},
booktitle = {30th Annual Meeting of the Cognitive Science Society},
pages = {1282--1287},
publisher = {Cognitive Science Society},
address = {Austin, TX},
abstract = {We describe VGEM, a technique for converting probability-based measures of semantic relatedness (e.g. Normalized Google Distance, Pointwise Mutual Information) into a vector-based form to allow these measures to evaluate relatedness of multi-word terms (documents, paragraphs). We use a genetic algorithm to derive a set of 300 dimensions to represent the human semantic space. With the resulting dimension sets, VGEM matches or outperforms the probability-based measure, while adding the multi-word term functionality. We test VGEM's performance on multi-word terms against Latent Semantic Analysis and find no significant difference between the two measures. We conclude that VGEM is more useful than probability-based measures because it affords better performance, and provides relatedness between multi-word terms; and that VGEM is more useful than other vector-based measures because it is more computationally feasible for large, dynamic corpora (e.g. WWW), and thus affords a larger, dynamic lexicon.},
keywords = {computational linguistics, Latent Semantic Analysis, LSA, Measures of Semantic Relatedness, multidimensional semantic space, natural language processing, NGD, Normalized Google Distance, semantic dimensions, vector generation, VGEM},
pubstate = {published},
tppubtype = {incollection}
}
We describe VGEM, a technique for converting probability-based measures of semantic relatedness (e.g. Normalized Google Distance, Pointwise Mutual Information) into a vector-based form to allow these measures to evaluate relatedness of multi-word terms (documents, paragraphs). We use a genetic algorithm to derive a set of 300 dimensions to represent the human semantic space. With the resulting dimension sets, VGEM matches or outperforms the probability-based measure, while adding the multi-word term functionality. We test VGEM's performance on multi-word terms against Latent Semantic Analysis and find no significant difference between the two measures. We conclude that VGEM is more useful than probability-based measures because it affords better performance, and provides relatedness between multi-word terms; and that VGEM is more useful than other vector-based measures because it is more computationally feasible for large, dynamic corpora (e.g. WWW), and thus affords a larger, dynamic lexicon.
@inproceedings{lindsey07iccm,
title = {Be wary of what your computer reads: The effects of corpus selection on measuring semantic relatedness},
author = {Lindsey, Robert and Veksler, Vladislav Daniel and Grintsvayg, Alex and Gray, Wayne D.},
doi = {10.13140/2.1.2295.1206},
crossref = {conf:iccm07},
abstract = {Measures of Semantic Relatedness (MSRs) provide models of human semantic associations and, as such, have been applied to predict human text comprehension (Lemaire, Denhiere, Bellissens, & Jhean-Larose, 2006). In addition, MSRs form key components in more integrated cognitive modeling such as models that perform information search on the World Wide Web (WWW) (Pirolli, 2005). However, the effectiveness of an MSR depends on the algorithm it uses as well as the text corpus on which it is trained. In this paper, we examine the impact of corpus selection on the performance of two popular MSRs, Pointwise Mutual Information and Normalised Google Distance. We tested these measures with corpora derived from the WWW, books, news articles, emails, web-forums, and encyclopedia. Results indicate that for the tested MSRs, the traditionally employed books and WWW-based corpora are less than optimal, and that using a corpus based on the New York Times news articles best predicts human behavior.},
keywords = {computational linguistics, corpus comparison, Measures of Semantic Relatedness, natural language processing, NGD, Normalised Google Distance, PMI, Pointwise Mutual Information, semantic similarity, training corpus},
pubstate = {published},
tppubtype = {inproceedings}
}
Measures of Semantic Relatedness (MSRs) provide models of human semantic associations and, as such, have been applied to predict human text comprehension (Lemaire, Denhiere, Bellissens, & Jhean-Larose, 2006). In addition, MSRs form key components in more integrated cognitive modeling such as models that perform information search on the World Wide Web (WWW) (Pirolli, 2005). However, the effectiveness of an MSR depends on the algorithm it uses as well as the text corpus on which it is trained. In this paper, we examine the impact of corpus selection on the performance of two popular MSRs, Pointwise Mutual Information and Normalised Google Distance. We tested these measures with corpora derived from the WWW, books, news articles, emails, web-forums, and encyclopedia. Results indicate that for the tested MSRs, the traditionally employed books and WWW-based corpora are less than optimal, and that using a corpus based on the New York Times news articles best predicts human behavior.