0

I have a huge amount of data in a file that I need to read and compute some probabilities on, so I need to count the number of occurrences of each word in the whole file and do some further calculations on it. The file contains one and a half million records, and each record is about 6 strings. I used a vector to store this data, but the program crashes after saving about 8000 records. Is there a way of saving this vector on disk rather than in the program's memory? I also came across something called a symbol table while searching, but I couldn't understand what it means or how to use it.

Is there any solution to this problem?

This is the Mainfile

#include <fstream>
#include <iostream>
#include <istream>
#include <string>
#include <unordered_map>
#include <vector>

#include "Tuple.h"
#include "VerbPair.h"
using namespace std;

// Default input file; main() passes it to train().
string filename = "verb-argument-tuples.txt";
// Parsed records; assigned by train().
vector<Tuple> mytuples;
// Per-verb frequency totals; assigned by train().
vector<VerbPair> verbpairs;

/// Reads every record of the tuples file into memory.
///
/// Each non-blank line is handed to Tuple::parseTuple and the resulting
/// Tuple is appended to the returned vector, in file order.
///
/// @param filename  path of the text file to read, one record per line.
/// @return the parsed records; empty if the file cannot be opened.
std::vector<Tuple> readTupleFile(const std::string& filename)
{
    std::cout << "Started parsing the file of tuples..." << std::endl;
    std::vector<Tuple> mt;

    std::ifstream infile(filename);
    if (!infile)
    {
        // Looping on !eof() over a stream that never opened would spin
        // forever, so report the problem and bail out instead.
        std::cerr << "Could not open tuples file: " << filename << std::endl;
        return mt;
    }

    std::string line;
    // Testing the getline result (rather than eof()) stops exactly after the
    // last line that was really read.
    while (std::getline(infile, line))
    {
        // Blank lines (e.g. a trailing newline at end of file) carry no record.
        if (line.find_first_not_of(" \t\r") == std::string::npos)
            continue;

        // A fresh Tuple per record: reusing one object would let the args of
        // all earlier lines pile up inside every later tuple.
        Tuple t;
        t.parseTuple(line);
        mt.push_back(t);
    }

    std::cout << "Done with reading tuples file..." << std::endl;
    return mt;
}

vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
    vector<VerbPair> pairs;
    bool flag = false;
    VerbPair temp;
    for(int i=0;i<mytuples.size();i++)
    {
        flag = false;
        for(int h=0;h<pairs.size();h++)
        {
            if (mytuples[i].verb.compare(pairs[h].verb) == 0)
            {
                pairs[h].freq += mytuples[i].count;
                flag =true;
                break;
            }
        }
        if(! flag)
        {
            temp.verb = mytuples[i].verb;
            temp.freq = mytuples[i].count;
            pairs.push_back(temp);
        }
    }
    return pairs;
}

/// Counts the lines of a text file.
///
/// A final line without a terminating newline is counted; a trailing newline
/// does not add an extra empty line.
///
/// @param filename  path of the file to inspect.
/// @return the number of lines read, or 0 if the file is empty or cannot be
///         opened.
int numOfLines(const std::string& filename)
{
    std::ifstream infile(filename);

    int numLines = 0;
    std::string line;
    // Loop on the getline result: checking eof() before reading counts one
    // line too many and never terminates when the stream failed to open.
    while (std::getline(infile, line))
        ++numLines;

    return numLines;
}

void train(string filename)
{
    mytuples = readTupleFile(filename);
    verbpairs = getVerbPairs(mytuples);
}
// Placeholder with an empty body: `filename` is currently unused.
// NOTE(review): presumably meant to persist the trained data - confirm intent.
void store(string filename)
{

}
// Placeholder with an empty body: `filename` is currently unused.
// NOTE(review): presumably meant to restore previously stored data - confirm intent.
void load(string filename)
{

}

int main()
{
    cout << "Started Application..." << endl;
    train(filename);
    cout << "Size of verb pairs is " << verbpairs.size() << endl;
}

Tuple.h

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;

/// One record of the verb/argument tuples file.
///
/// parseTuple() expects a line of the form
///     <count> <verb> <frame> <arg> <arg> ...
/// where fields are separated by exactly one space or tab.
class Tuple
{
public:
    int count = 0;                  ///< leading numeric field of the record
    std::string verb;               ///< second field
    std::string frame;              ///< third field
    std::vector<std::string> args;  ///< every remaining field, in order

    /// Parses one record line into this object, replacing any previous
    /// contents (including `args`, so an object can safely be reused).
    ///
    /// Lines that are shorter than expected, or empty, leave the missing
    /// fields empty / zero instead of reading past the end of the string.
    ///
    /// @param s  the raw line, without the trailing newline.
    void parseTuple(const std::string& s)
    {
        std::cout << "parsing.... " << s << std::endl;

        std::size_t pos = 0;

        // First field: decimal count, accumulated digit by digit.
        int value = 0;
        while (pos < s.length() && !isSeparator(s[pos]))
        {
            value = value * 10 + (s[pos] - '0');
            ++pos;
        }
        count = value;
        ++pos;  // step over the separator that ended the count

        verb = readField(s, pos);
        frame = readField(s, pos);

        // Drop arguments left over from an earlier parse; otherwise they
        // accumulate on every call when the same Tuple is reused.
        args.clear();
        while (pos < s.length())
            args.push_back(readField(s, pos));
    }

private:
    /// True for the two characters that delimit fields: space and tab.
    static bool isSeparator(char c)
    {
        return c == ' ' || c == '\t';
    }

    /// Returns the field starting at `pos` and advances `pos` just past the
    /// separator that follows it. Yields an empty string once `pos` is at or
    /// beyond the end of `s`.
    static std::string readField(const std::string& s, std::size_t& pos)
    {
        std::string field;
        // Bounds test comes first so s[pos] is never evaluated out of range.
        while (pos < s.length() && !isSeparator(s[pos]))
        {
            field += s[pos];
            ++pos;
        }
        ++pos;
        return field;
    }
};

and VerbPair.h

#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <istream>
using namespace std;

/// A verb together with its accumulated frequency.
class VerbPair
{
public:
    std::string verb;  ///< the verb this entry counts
    int freq = 0;      ///< running total; zero-initialised so a default-constructed pair is well defined
};
Community
  • 1
  • 1
Michael Girgis
  • 145
  • 4
  • 14
  • 3
    it doesn't sound like a memory problem, could you show the code? -- or perhaps the error? – Martin Kristiansen May 16 '12 at 08:28
  • Don't use a vector for this. Use a deque or list. – David Schwartz May 16 '12 at 08:53
  • @DavidSchwartz: according to a graph that Stoustrup shows in his lecture(http://channel9.msdn.com/Events/GoingNative/GoingNative-2012/Keynote-Bjarne-Stroustrup-Cpp11-Style) trees and pointer-based lists are evil in this sorta context. But you are right any algo-course will tell you to use directed queues or linked lists. – Martin Kristiansen May 16 '12 at 09:44
  • The problem with using a vector is that it needs to be resized and it needs to allocate contiguous memory. Interspersing the resizing of the vector with the allocation of the objects in it can lead to an allocation failure due to insufficient contiguous virtual memory. Vectors are not suitable for highly dynamic data structures. – David Schwartz May 16 '12 at 09:57
  • @DavidSchwartz: That is simply not true, I know its 'text-book-true', but running experiments will tell you that the complexity argument at best needs larger data, at worst does not hold. if you look at the graph in Stoustrups Paper(page 51, figure 1) "Software Development for Infrastructure" you will see that for this sort of operations nothing beats a std::vector. – Martin Kristiansen May 16 '12 at 10:52
  • Where does your code fail? Can you spot the function that stops the show? perhaps using a debugger? – Martin Kristiansen May 16 '12 at 12:13

3 Answers3

1

Can you try using the vector's reserve function? Since you already know that you have a large amount of data, you should reserve capacity up front.

Also, use map in this case, since using map, you will be able to count the number of occurences easily.

For the crash, you will have to show us the code.

0

Since there is duplicate data, why are you using vector. Just use a map<string,int>. Each time you encounter a word, increment the corresponding value in the map.

Luchian Grigore
  • 253,575
  • 64
  • 457
  • 625
  • 1
    even though this is right, it doesn't really relate to his problem. 8000 records should in no way crash a `std::vector`. unless he is doing something strange -- some sort of recursion done wrong or something. – Martin Kristiansen May 16 '12 at 08:30
0

You have a lot of shadowed variables in your code: for example, you declare the filename variable globally and then use the same name locally three lines later. You do the same with the tuple vector and the verb-pair vector.

Perhaps some encapsulation would make your debugging task easier.

Another style issue would be a function like:

// Sums `count` per distinct `verb`, keeping pairs in first-seen order.
// The vector parameter is taken by value, so the caller's vector is copied
// on every call; the inner loop scans all pairs collected so far for each tuple.
vector<VerbPair> getVerbPairs(vector<Tuple> mytuples)
{
    vector<VerbPair> pairs;
    bool flag = false;
    VerbPair temp;
    for(int i=0;i<mytuples.size();i++)
    {
        flag = false;
        for(int h=0;h<pairs.size();h++)
        {
            if (mytuples[i].verb.compare(pairs[h].verb) == 0)
            {
                pairs[h].freq += mytuples[i].count;
                flag =true;
                break;
            }
        }
        if(! flag)
        {
            temp.verb = mytuples[i].verb;
            temp.freq = mytuples[i].count;
            pairs.push_back(temp);
        }
    }
    return pairs;
}

There are a few things that make it hard to debug. The first one is the shadowing; the second one is that you don't let the compiler help you.

// Same aggregation as the by-value version, but the input is taken by const
// reference: no copy is made and the body cannot modify the caller's vector.
vector<VerbPair> getVerbPairs(const vector<Tuple>& mytuples)
{
  vector<VerbPair> pairs;
  bool flag = false;  // set when the current tuple's verb already has a pair
  VerbPair temp;
  for(int i=0;i<mytuples.size();i++)
    {
      flag = false;
      for(int h=0;h<pairs.size();h++)
    {
      if (mytuples[i].verb.compare(pairs[h].verb) == 0)
        {
          pairs[h].freq += mytuples[i].count;
          flag =true;
          break;
        }
    }
      if(! flag)
    {
      temp.verb = mytuples[i].verb;
      temp.freq = mytuples[i].count;
      pairs.push_back(temp);
    }
    }
  return pairs;
}

This way the compiler will tell you if you try to modify the mytuples vector.

Martin Kristiansen
  • 9,875
  • 10
  • 51
  • 83