admin 管理员组文章数量: 1086019
I'm trying to implement a piece of code on javascript to analyse word/frequency on a given string. My objective is to return a array as the following:
[{text: firstword, size:3 },{text:secondword , size:5 },{text: nword, size: 1},...]
I implemented the following code but I'm running out of memory, so I don't really know if its ok or not.
function wordFrequency(txt){
var wordArray = txt.split(/[ .?!,*'"]/);
var newArray = [];
$.each(wordArray, function (ix, word) {
if (newArray.length >= 1){
newArray.some(function (w){
if (w.text === word){
w.size++;
} else {
newArray.push({text: word, size: 1});
}
});
} else {
newArray.push({text: word, size: 1});
}
});
return newArray;
}
I'm trying to implement a piece of code on javascript to analyse word/frequency on a given string. My objective is to return a array as the following:
[{text: firstword, size:3 },{text:secondword , size:5 },{text: nword, size: 1},...]
I implemented the following code but I'm running out of memory, so I don't really know if its ok or not.
function wordFrequency(txt){
var wordArray = txt.split(/[ .?!,*'"]/);
var newArray = [];
$.each(wordArray, function (ix, word) {
if (newArray.length >= 1){
newArray.some(function (w){
if (w.text === word){
w.size++;
} else {
newArray.push({text: word, size: 1});
}
});
} else {
newArray.push({text: word, size: 1});
}
});
return newArray;
}
Share
Improve this question
edited Nov 12, 2014 at 0:13
diegodacal
asked Nov 12, 2014 at 0:06
diegodacaldiegodacal
971 silver badge11 bronze badges
7
- It's ok in my console – Nickool Commented Nov 12, 2014 at 0:17
- but it will have repeated result I tried this one: wordFrequency("negin!kjs$wkjh*df oiuw& skdjhh"); – Nickool Commented Nov 12, 2014 at 0:24
- I'm trying to use a big chunck of text and Im getting out of memory every time, even when tried with jsfiddle. :( – diegodacal Commented Nov 12, 2014 at 0:24
- because when you have space you shouldn't count it as a word – Nickool Commented Nov 12, 2014 at 0:25
- just remove the space and try again i mean in here : txt.split(/[ .?!,'"]/); remove and instead have this: txt.split(/[.?!,'"]/); – Nickool Commented Nov 12, 2014 at 0:28
5 Answers
Reset to default 2Array.prototype.some expects the given callback to return true or false and returns true as soon as your callback returns true for a given element, otherwise it returns false.
So some
iterates over all elements, with your given callback, and your callback checks if the given element text equals the search word and if not adds a new object. Introducing a new element the some
function can iterate over.
So to make this clear, for every word thats in the newArray
before the word you're searching, you're adding a new object containing your word.
Suppose your newArray
looks like this:
[{word:"test"},{word:"another"},{word:"one"},{word:"more"}]
after calling your function for the word even
it looks like this:
[{word:"test"},{word:"another"},{word:"one"},{word:"more"},{word:"even"},{word:"even"},{word:"even"},{word:"even"}]
Using Array.prototype.filter would be the better approach here, finding you the matching element, note that I also replaced $.each
with Array.prototype.forEach:
function wordFrequency(txt){
var wordArray = txt.split(/[ .?!,*'"]/);
var newArray = [], wordObj;
wordArray.forEach(function (word) {
wordObj = newArray.filter(function (w){
return w.text == word;
});
if (wordObj.length) {
wordObj[0].size += 1;
} else {
newArray.push({text: word, size: 1});
}
});
return newArray;
}
document.write(JSON.stringify(wordFrequency("count everything, count all the words, count all the words!").sort(function(a,b){return a.size<b.size})).split("},").join("}<br/>"));
It would be simpler and far more efficient to create a direct map from word to frequency, and only afterwards convert that to your array structure. Given an array words
create a map of the words:
var freq = words.reduce(function(p, c) {
p[c] = (p[c] || 0) + 1;
return p;
}, {});
and the convert that map into your array:
var array = Object.keys(freq).map(function(key) {
return { text: key, size: freq[key] };
});
To tell the frequency all you need is a hash map approach. Your algorithm is quadratic, since the some
method is nested in the each
method, so you're always looping over the newArray
just to find an entry and increment the size.
A map approach is easily achievable using a JavaScript object. It also gives you constant look-up time, which is better performance than the nested loops approach.
Try this approach instead:
function wordFrequency(txt){
var wordArray = txt.split(/[ .?!,*'"]/);
var map = {};
$.each(wordArray, function(ix, word) {
// skip empty results
if (!word.length) {
return;
}
// add word to map
if (!map[word]) {
map[word] = 0;
}
map[word]++;
});
return map;
}
To use the function:
var text = "hello!world*hello foo 'bar'foo";
var result = wordFrequency(text);
// iterate over results
Object.keys(result).forEach(function(w) {
console.log(w + ": " + result[w]);
});
// or use for...in
for (var w in result) {
console.log(w + ": " + result[w]);
}
If you really wanted to, you could then map the result into your desired array format with text and size properties:
var mappedResult = Object.keys(result).map(function(w) {
return { text: w, size: result[w] };
});
console.log(mappedResult);
Also, depending on your target browsers, you might consider using the array forEach
instead of the jQuery $.each
, similar to what I did with the Object.keys
portion.
Here's the JSBin example.
You would probably want to avoid any iterations on duplicate elements and keep your results array unique. Since any of the iterators of Array.prototype will include each of the elements, they might not be the ideal solution for this. Sometimes plain old loops do the job best ... (You may also want to expressively escape any special characters in your regular expression).
function wordFrequency(txt) {
var words = txt.split(/[ \.\?!,\*'"]+/),
seen = [];
for (var i = 0; i < words.length; i++) {
var w = words[i],
found = false;
for (var j = 0; j < seen.length; j++) {
if (w === seen[j].text) {
seen[j].size++;
found = true;
break;
}
}
if (!found) seen.push( { text: w, size: 1 } );
}
return seen;
}
(Note that the inner for-loop isn't visited for the first word, so the first word will be pushed to the seen-stack and the inner for-loop will start with the second word pared to the first one. Only words that we haven't seen already are added to the seen-stack, making it an array of unique elements.)
And here is the equivalent using Array.prototype.forEach() and Array.prototype.indexOf(), but we have to add another intermediate results stack for the latter one. So we'll have to add another iteration to produce the final result. (We wouldn't have to do this using Array.prototype.findIndex(), but this is not a standard method.)
function wordFrequency2(txt) {
var words = txt.split(/[ \.\?!,\*'"]+/),
seen = [],
freq = [];
// get frequencies
words.forEach(function (w) {
var idx = seen.indexOf(w);
if (idx >= 0) {
freq[idx]++;
}
else {
seen.push(w);
freq.push(1);
}
});
// produce the results array
var r = [];
seen.forEach(function (w, idx) {
r.push( { text: w, size: freq[idx] } );
});
return r;
}
Putting optimization into account, the first version using explicit loops will be probably performing faster ...
var words = (function(){
var sWords = document.body.innerText.toLowerCase().trim().replace(/[,;.]/g,'').split(/[\s\/]+/g).sort();
var iWordsCount = sWords.length; // count w/ duplicates
// array of words to ignore
var ignore = ['and','the','to','a','of','for','as','i','with','it','is','on','that','this','can','in','be','has','if'];
ignore = (function(){
var o = {}; // object prop checking > in array checking
var iCount = ignore.length;
for (var i=0;i<iCount;i++){
o[ignore[i]] = true;
}
return o;
}());
var counts = {}; // object for math
for (var i=0; i<iWordsCount; i++) {
var sWord = sWords[i];
if (!ignore[sWord]) {
counts[sWord] = counts[sWord] || 0;
counts[sWord]++;
}
}
var arr = []; // an array of objects to return
for (sWord in counts) {
arr.push({
text: sWord,
frequency: counts[sWord]
});
}
// sort array by descending frequency | http://stackoverflow./a/8837505
return arr.sort(function(a,b){
return (a.frequency > b.frequency) ? -1 : ((a.frequency < b.frequency) ? 1 : 0);
});
}());
(function(){
var iWordsCount = words.length; // count w/o duplicates
for (var i=0; i<iWordsCount; i++) {
var word = words[i];
console.log(word.frequency, word.text);
}
}());
本文标签: Word frequency for array of keyvalues on javascriptStack Overflow
版权声明:本文标题:Word frequency for array of keyvalues on javascript - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://roclinux.cn/p/1744099529a2533433.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论