admin 管理员组

文章数量: 1086019

I'm trying to implement a piece of code on javascript to analyse word/frequency on a given string. My objective is to return a array as the following:

[{text: firstword, size:3 },{text:secondword , size:5 },{text: nword, size: 1},...]

I implemented the following code but I'm running out of memory, so I don't really know if its ok or not.

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var newArray = [];
    $.each(wordArray, function (ix, word) {
        if (newArray.length >= 1){
            newArray.some(function (w){
                if (w.text === word){
                    w.size++;
                } else {
                    newArray.push({text: word, size: 1});
                }
            });
        } else {
            newArray.push({text: word, size: 1});
        }
    });
    return newArray;
}

I'm trying to implement a piece of code on javascript to analyse word/frequency on a given string. My objective is to return a array as the following:

[{text: firstword, size:3 },{text:secondword , size:5 },{text: nword, size: 1},...]

I implemented the following code but I'm running out of memory, so I don't really know if its ok or not.

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var newArray = [];
    $.each(wordArray, function (ix, word) {
        if (newArray.length >= 1){
            newArray.some(function (w){
                if (w.text === word){
                    w.size++;
                } else {
                    newArray.push({text: word, size: 1});
                }
            });
        } else {
            newArray.push({text: word, size: 1});
        }
    });
    return newArray;
}
Share Improve this question edited Nov 12, 2014 at 0:13 diegodacal asked Nov 12, 2014 at 0:06 diegodacaldiegodacal 971 silver badge11 bronze badges 7
  • It's ok in my console – Nickool Commented Nov 12, 2014 at 0:17
  • but it will have repeated result I tried this one: wordFrequency("negin!kjs$wkjh*df oiuw& skdjhh"); – Nickool Commented Nov 12, 2014 at 0:24
  • I'm trying to use a big chunck of text and Im getting out of memory every time, even when tried with jsfiddle. :( – diegodacal Commented Nov 12, 2014 at 0:24
  • because when you have space you shouldn't count it as a word – Nickool Commented Nov 12, 2014 at 0:25
  • just remove the space and try again i mean in here : txt.split(/[ .?!,'"]/); remove and instead have this: txt.split(/[.?!,'"]/); – Nickool Commented Nov 12, 2014 at 0:28
 |  Show 2 more ments

5 Answers 5

Reset to default 2

Array.prototype.some expects the given callback to return true or false and returns true as soon as your callback returns true for a given element, otherwise it returns false.

So some iterates over all elements, with your given callback, and your callback checks if the given element text equals the search word and if not adds a new object. Introducing a new element the some function can iterate over.

So to make this clear, for every word thats in the newArray before the word you're searching, you're adding a new object containing your word.

Suppose your newArray looks like this:

[{word:"test"},{word:"another"},{word:"one"},{word:"more"}]

after calling your function for the word even it looks like this:

[{word:"test"},{word:"another"},{word:"one"},{word:"more"},{word:"even"},{word:"even"},{word:"even"},{word:"even"}]

Using Array.prototype.filter would be the better approach here, finding you the matching element, note that I also replaced $.each with Array.prototype.forEach:

function wordFrequency(txt){
  var wordArray = txt.split(/[ .?!,*'"]/);
  var newArray = [], wordObj;
  wordArray.forEach(function (word) {
    wordObj = newArray.filter(function (w){
      return w.text == word;
    });
    if (wordObj.length) {
      wordObj[0].size += 1;
    } else {
      newArray.push({text: word, size: 1});
    }
  });
  return newArray;
}
document.write(JSON.stringify(wordFrequency("count everything, count all the words, count all the words!").sort(function(a,b){return a.size<b.size})).split("},").join("}<br/>"));

It would be simpler and far more efficient to create a direct map from word to frequency, and only afterwards convert that to your array structure. Given an array words create a map of the words:

var freq = words.reduce(function(p, c) {
    p[c] = (p[c] || 0) + 1;
    return p;
}, {});

and the convert that map into your array:

var array = Object.keys(freq).map(function(key) {
   return { text: key, size: freq[key] };
});

To tell the frequency all you need is a hash map approach. Your algorithm is quadratic, since the some method is nested in the each method, so you're always looping over the newArray just to find an entry and increment the size.

A map approach is easily achievable using a JavaScript object. It also gives you constant look-up time, which is better performance than the nested loops approach.

Try this approach instead:

function wordFrequency(txt){
    var wordArray = txt.split(/[ .?!,*'"]/);
    var map = {};
    $.each(wordArray, function(ix, word) {
      // skip empty results
      if (!word.length) {
        return;
      }
      // add word to map
      if (!map[word]) {
        map[word] = 0;
      } 
      map[word]++;
    });
    return map;
}

To use the function:

var text = "hello!world*hello foo  'bar'foo";
var result = wordFrequency(text);

// iterate over results
Object.keys(result).forEach(function(w) {
  console.log(w + ": " + result[w]);
});

// or use for...in
for (var w in result) {
  console.log(w + ": " + result[w]);
}

If you really wanted to, you could then map the result into your desired array format with text and size properties:

var mappedResult = Object.keys(result).map(function(w) {
  return { text: w, size: result[w] };
});
console.log(mappedResult);

Also, depending on your target browsers, you might consider using the array forEach instead of the jQuery $.each, similar to what I did with the Object.keys portion.

Here's the JSBin example.

You would probably want to avoid any iterations on duplicate elements and keep your results array unique. Since any of the iterators of Array.prototype will include each of the elements, they might not be the ideal solution for this. Sometimes plain old loops do the job best ... (You may also want to expressively escape any special characters in your regular expression).

function wordFrequency(txt) {
    var words = txt.split(/[ \.\?!,\*'"]+/),
        seen = [];
    for (var i = 0; i < words.length; i++) {
        var w = words[i],
            found = false;
        for (var j = 0; j < seen.length; j++) {
            if (w === seen[j].text) {
                seen[j].size++;
                found = true;
                break;
            }
        }
        if (!found) seen.push( { text: w, size: 1 } );
    }
    return seen;
}

(Note that the inner for-loop isn't visited for the first word, so the first word will be pushed to the seen-stack and the inner for-loop will start with the second word pared to the first one. Only words that we haven't seen already are added to the seen-stack, making it an array of unique elements.)

And here is the equivalent using Array.prototype.forEach() and Array.prototype.indexOf(), but we have to add another intermediate results stack for the latter one. So we'll have to add another iteration to produce the final result. (We wouldn't have to do this using Array.prototype.findIndex(), but this is not a standard method.)

function wordFrequency2(txt) {
    var words = txt.split(/[ \.\?!,\*'"]+/),
        seen = [],
        freq = [];
    // get frequencies
    words.forEach(function (w) {
        var idx = seen.indexOf(w);
        if (idx >= 0) {
            freq[idx]++;
        }
        else {
            seen.push(w);
            freq.push(1);
        }
    });
    // produce the results array
    var r = [];
    seen.forEach(function (w, idx) {
        r.push( { text: w, size: freq[idx] } );
    });
    return r;
}

Putting optimization into account, the first version using explicit loops will be probably performing faster ...

var words = (function(){

var sWords = document.body.innerText.toLowerCase().trim().replace(/[,;.]/g,'').split(/[\s\/]+/g).sort();
var iWordsCount = sWords.length; // count w/ duplicates

// array of words to ignore
var ignore = ['and','the','to','a','of','for','as','i','with','it','is','on','that','this','can','in','be','has','if'];
ignore = (function(){
var o = {}; // object prop checking > in array checking
var iCount = ignore.length;
for (var i=0;i<iCount;i++){
o[ignore[i]] = true;
}
return o;
}());

var counts = {}; // object for math
for (var i=0; i<iWordsCount; i++) {
var sWord = sWords[i];
if (!ignore[sWord]) {
counts[sWord] = counts[sWord] || 0;
counts[sWord]++;
}
}

var arr = []; // an array of objects to return
for (sWord in counts) {
arr.push({
text: sWord,
frequency: counts[sWord]
});
}

// sort array by descending frequency | http://stackoverflow./a/8837505
return arr.sort(function(a,b){
return (a.frequency > b.frequency) ? -1 : ((a.frequency < b.frequency) ? 1 : 0);
});

}());

(function(){
var iWordsCount = words.length; // count w/o duplicates
for (var i=0; i<iWordsCount; i++) {
var word = words[i];
console.log(word.frequency, word.text);
}
}());

本文标签: Word frequency for array of keyvalues on javascriptStack Overflow