I looked at several of the suggested solutions but none seemed to rise to this confounding data formatting challenge.
I have a huge JSON file (over 100k rows) and massive duplicates of data all as top level objects. Here's an example:
[
{
"manufacturer":"Samsung",
"device":"Galaxy A32 5G",
"model":"SM-A326B",
"chipset":"Mediatek MT6853V/NZA",
"date":"2022-01-01",
"fw_id":"A326BXXS4AVA1",
"android":"R(Android 11)",
"known_passcode":false,
"afu":false,
"bfu":false,
"bruteforce":false
},
{
"manufacturer":"Samsung",
"device":"Galaxy A32 5G",
"model":"SM-A326U",
"chipset":"Mediatek MT6853V/NZA",
"date":"2021-03-01",
"fw_id":"A326USQU1AUD4",
"android":"R(Android 11)",
"known_passcode":true,
"afu":false,
"bfu":true,
"bruteforce":true
},
{
"manufacturer":"Samsung",
"device":"Galaxy A32 5G",
"model":"SM-A326U1",
"chipset":"Mediatek MT6853V/NZA",
"date":"2021-09-01",
"fw_id":"A326U1UEU5AUJ2",
"android":"R(Android 11)",
"known_passcode":true,
"afu":false,
"bfu":true,
"bruteforce":true
},
{
"manufacturer":"LGE",
"device":"LG K31",
"model":"LGL355DL",
"chipset":"Mediatek MT6762",
"date":"unknown",
"fw_id":"L355DL10l",
"android":"unknown",
"known_passcode":false,
"afu":false,
"bfu":false,
"bruteforce":false
}
]
This needs to be organized so that data points like manufacturer, device, model are not duplicated hundreds of times.
Btw, here's a JSFiddle to play with: https://jsfiddle.net/xpancom/Lq7duahv/
Ideally, the JSON format would be the following:
[
{
"manufacturers": [
{
"manufacturer": "Samsung",
"devices": [
{
"device": "Galaxy A32 5G",
"models": [
{
"model": "SM-A326B",
"data": [
{
"chipset": "Mediatek MT6853V/NZA",
"date": "2022-01-01",
"fw_id": "A326BXXS4AVA1",
"android": "R(Android 11)",
"known_passcode": false,
"afu": false,
"bfu": false,
"bruteforce": false
},
{
"chipset": "Mediatek MT6853V/NZA",
"date": "2021-09-01",
"fw_id": "A326BXXU3AUH7",
"android": "R(Android 11)",
"known_passcode": true,
"afu": false,
"bfu": true,
"bruteforce": true
}
]
},
{
"model": "SM-A326U1",
"data": [
{
"chipset": "Mediatek MT6853V/NZA",
"date": "2021-09-01",
"fw_id": "A326U1UEU5AUJ2",
"android": "R(Android 11)",
"known_passcode": true,
"afu": false,
"bfu": true,
"bruteforce": true
}
]
}
]
}
]
},
{
"manufacturer": "LGE",
"devices": [
{
"device": "LG K31",
"models": [
{
"model": "SM-A326B",
"data": [
{
"chipset": "Mediatek MT6762",
"date": "unknown",
"fw_id": "L355DL10l",
"android": "unknown",
"known_passcode": false,
"afu": false,
"bfu": false,
"bruteforce": false
}
]
}
]
}
]
}
]
}
]
Working in React, here's what I've got so far in trying to massage this data:
const source = data;
const destination = [];
const classifiedTokens = []; // will be used to stored already classified tokens
const classifiedTokensModel = []; // will be used to stored already classified tokens for models
const getNextTokenArray = (source) => {
let unusedToken = null;
const nextTokenArray = source.filter(function (element) {
if (!unusedToken && !classifiedTokens.includes(element['device'])) {
unusedToken = element['device'];
classifiedTokens.push(unusedToken);
}
return unusedToken ? unusedToken === element['device'] : false;
});
return unusedToken ? nextTokenArray : null;
};
// Pass in arrays deconstructed from addToDestination to process third tier nested objects for models
const getNextTokenArrayModel = (tokenObject) => {
let tokenObjectDevice = tokenObject['device'];
let tokenObjectData = tokenObject['data'];
let unusedTokenModel = null;
const nextTokenArrayModel = tokenObjectData.filter(function (element) {
if (!unusedTokenModel && !classifiedTokensModel.includes(element['model'])) {
unusedTokenModel = element['model'];
classifiedTokensModel.push(unusedTokenModel);
}
return unusedTokenModel ? unusedTokenModel === element['model'] : false;
});
//return unusedTokenModel ? nextTokenArrayModel : null;
if (unusedTokenModel) {
if (nextTokenArrayModel.length === 0) return;
let res = {
device: tokenObjectDevice,
model: nextTokenArrayModel[0]['model'],
data: [],
};
nextTokenArrayModel.forEach((element) => {
res.data.push({
manufacturer: element.manufacturer,
chipset: element.chipset,
date: element.date,
fw_id: element.fw_id,
android: element.android,
knownPasscode: element.knownPasscode,
afu: element.afu,
bfu: element.bfu,
bruteforce: element.bruteforce,
});
});
destination.push(res);
} else {
return null;
}
};
const addToDestination = (tokenArray) => {
if (tokenArray.length === 0) return;
let res = {
device: tokenArray[0]['device'],
data: [],
};
tokenArray.forEach((element) => {
res.data.push({
manufacturer: element.manufacturer,
model: element.model,
chipset: element.chipset,
date: element.date,
fw_id: element.fw_id,
android: element.android,
knownPasscode: element.knownPasscode,
afu: element.afu,
bfu: element.bfu,
bruteforce: element.bruteforce,
});
});
getNextTokenArrayModel(res); // Call this to process and group nested model duplicates by device
//destination.push(res);
};
let nextTokenArray = getNextTokenArray(source);
while (nextTokenArray) {
addToDestination(nextTokenArray);
nextTokenArray = getNextTokenArray(source);
}
setTimeout(() => {
document.getElementById('root').innerHTML =
'<pre>' + JSON.stringify(destination, null, 2) + '</pre>';
}, 1000);
};
And here's the JSFiddle again: https://jsfiddle.net/xpancom/Lq7duahv/
Who can smash this data formatting dilemma?