I am given the following CSV file which I extracted from an excel spreadsheet. Just to give some background information which could be of assistance, it discusses AGI Numbers (think of it as protein identifiers), unmodified peptide sequences for those protein identifiers, and then modified peptide sequences with modifications made on the unmodified sequences, the index/indeces of those modifications, and then the combined spectral count for repeated peptides. The text file is called MASP.GlycoModReader.txt and the information is in the following format below:
AGI,UnMd Peptide (M) = x,Mod Peptide (oM) = Ox,Index/Indeces of Modification,counts,Combined
Spectral count for repeated Peptides
AT1G56070.1,NMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR,NoMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR,2,17
AT1G56070.1,LYMEARPMEEGLAEAIDDGR,LYoMEARPoMEEGLAEAIDDGR,"3, 9",1
AT1G56070.1,EAMTPLSEFEDKL,EAoMTPLSEFEDKL,3,7
AT1G56070.1,LYMEARPMEEGLAEAIDDGR,LYoMEARPoMEEGLAEAIDDGR,"3, 9",2
AT1G56070.1,EGPLAEENMR,EGPLAEENoMR,9,2
AT1G56070.1,DLQDDFMGGAEIIK,DLQDDFoMGGAEIIK,7,1
The output file that needs to result after extracting the above is in the following format below:
AT1G56070.1,{"peptides": [{"sequence": "NMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR", "mod_sequence":
"NoMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR" , "mod_indeces": 2, "spectral_count": 17}, {"sequence":
"LYMEARPMEEGLAEAIDDGR" , "mod_sequence": "LYoMEARPoMEEGLAEAIDDGR", "mod_indeces": [3, 9],
"spectral_count": 3}, {"sequence": "EAMTPLSEFEDKL" , "mod_sequence": "EAoMTPLSEFEDKL",
"mod_indeces": [3,9], "spectral_count": 7}, {"sequence": "EGPLAEENMR", "mod_sequence":
"EGPLAEENoMR", "mod_indeces": 9, "spectral_count": 2}, {"sequence": "DLQDDFMGGAEIIK",
"mod_sequence": "DLQDDFoMGGAEIIK", "mod_indeces": [7], "spectral_count": 1}]}
I have provided my solution below: If anyone has a better solution in another language or can possibly analyze mine and let me know if there are more efficient methods of coming about this, then please comment below. Thank you.
#!/usr/bin/env node
var fs = require('fs');
var csv = require('csv');
var data ="proteins.csv";
/* Uses csv nodejs module to parse the proteins.csv file.
* Parses the csv file row by row and updates the peptide_arr.
* For new entries creates a peptide object, for similar entries it updates the
* counts in the peptide object with the same AGI#.
* Uses a peptide object to store protein ID AGI#, and the associated data.
* Writes all formatted peptide objects to a txt file - output.txt.
*/
// Tracks current row
var x = 0;
// An array of peptide objects stores the information from the csv file
var peptide_arr = [];
// csv module reads row by row from data
csv()
.from(data)
.to('debug.csv')
.transform(function(row, index) {
// For the first entry push a new peptide object with the AGI# (row[0])
if(x == 0) {
// cur is the current peptide read into row by csv module
Peptide cur = new Peptide( row[0] );
// Add the assoicated data from row (1-5) to cur
cur.data.peptides.push({
"sequence" : row[1];
"mod_sequence" : row[2];
if(row[5]){
"mod_indeces" : "[" + row[3] + ", " + row[4] + "]";
"spectral_count" : row[5];
} else {
"mod_indeces" : row[3];
"spectral_count" : row[4];
}
});
// Add the current peptide to the array
peptide_arr.push(cur);
}
// Move to the next row
x++;
});
// Loop through peptide_arr and append output with each peptide's AGI# and its data
String output = "";
for(var peptide in peptide_arr)
{
output = output + peptide.toString()
}
// Write the output to output.txt
fs.writeFile("output.txt", output);
/* Peptide Object :
* - id:AGI#
* - data: JSON Array associated
*/
function Peptide(id) // this is the actual function that does the ID retrieving and data
// storage
{
this.id = id;
this.data = {
peptides: []
};
}
/* Peptide methods :
* - toJson : Returns the properly formatted string
*/
Peptide.prototype = {
toString: function(){
return this.id + "," + JSON.stringify(this.data, null, " ") + "/n"
}
};
Edited note: It seems when I run this solution I posted, I am getting a memory leak error; it is infinitely running while not producing any substantial, readable output. If anyone could be willing to assist in assessing why this is occurring, that would be great.