0

I'm trying to learn the Data Hub Framework 5.2.2 and, as part of that, I'm implementing a small project. Could someone help me understand the points below?

  • What is the main purpose of creating steps (ingestion, mapping)? As part of the flow, we already define each step very clearly with its inputs and outputs. Why do we need to create a step explicitly, and what purpose does it serve?
  • I'm trying to map the data using a mapping file, but the mapping is not applied: the same ingested file is loaded into the final database without being mapped. Please help me find what I have done wrong.

ingestionmapping.flow.json

{
  "name": "ingestionmapping",
  "description": "This is the default flow containing all of the default steps",
  "batchSize": 100,
  "threadCount": 4,
  "options": {
    "sourceQuery": null
  },
  "steps": {
    "1": {
      "name": "csv-ingest-step-json",
      "description": "ingests json docs in JSON format to data-hub-STAGING",
      "stepDefinitionName": "productIngestion",
      "stepDefinitionType": "INGESTION",
      "customHook" : {
        "module" : "",
        "parameters" : { },
        "user" : "",
        "runBefore" : false
      },
      "batchSize" : 100,
      "threadCount" : 4,
      "fileLocations": {
        "inputFilePath": "input",
        "outputURIReplacement": ".*input*.,'/mapping-flow/json'",
        "inputFileType": "csv"
      },
      "options": {
        "targetDatabase": "data-hub-STAGING",
        "sourceQuery": "cts.collectionQuery([])",
        "permissions": "data-hub-operator,read,data-hub-operator,update",
        "outputFormat": "json",
        "collections": [
          "mapping-flow-ingestion-json"
        ],
        "headers": {
          "sources": [{"name":  "ingestion_only-flow"}],
          "createdOn" : "currentDateTime",
          "createdBy" : "currentUser"
        }
      }
    },
    "2": {
      "name": "mapping-step",
      "description": "This is the default mapping step",
      "stepDefinitionName": "productMapping",
      "stepDefinitionType": "MAPPING",
      "customHook" : {
        "module" : "",
        "parameters" : { },
        "user" : "",
        "runBefore" : false
      },
      "batchSize" : 100,
      "threadCount" : 4,
      "options": {
        "sourceDatabase": "data-hub-STAGING",
        "targetDatabase": "data-hub-FINAL",
        "sourceQuery": "cts.collectionQuery('mapping-flow-ingestion-json')",
        "permissions": "data-hub-operator,read,data-hub-operator,update",
        "outputFormat": "json",
        "collections": [
          "mapping-flow-mapping-json",
          "mdm-content"
        ],
        "targetEntity": "modifiedproduct",
        "mapping": {
          "name": "ingestionmapping-productMapping",
          "version": 1
        },
        "validateEntity": false
      }
    }
  }
}

Mapping file: ingestionmapping-productMapping-1.mapping.json

{
  "lang" : "zxx",
  "name" : "ingestionmapping-productMapping",
  "description" : "",
  "version" : 1,
  "targetEntityType" : "http://marklogic.com/modifiedproduct-0.0.1/modifiedproduct",
  "sourceContext" : "/",
  "sourceURI" : "/mapping-flow/json/....json",
  "properties" : {
    "mgame_id" : {
      "sourcedFrom" : "game_id"
    },
    "mSKU" : {
      "sourcedFrom" : "SKU"
    },
    "mtitle" : {
      "sourcedFrom" : "title"
    },
    "mprice" : {
      "sourcedFrom" : "price"
    },
    "mdescription" : {
      "sourcedFrom" : "description"
    },
    "myears_active" : {
      "sourcedFrom" : "years_active"
    },
    "mpublication_date" : {
      "sourcedFrom" : "publication_date"
    },
    "mplayers" : {
      "sourcedFrom" : "players"
    },
    "mage_range" : {
      "sourcedFrom" : "age_range"
    },
    "msetup_time" : {
      "sourcedFrom" : "setup_time"
    },
    "mplaying_time" : {
      "sourcedFrom" : "playing_time"
    },
    "mchance" : {
      "sourcedFrom" : "chance"
    },
    "mcategory" : {
      "sourcedFrom" : "category"
    },
    "mhas_extensions" : {
      "sourcedFrom" : "has_extensions"
    },
    "mhas_accessories" : {
      "sourcedFrom" : "has_accessories"
    },
    "mhas_apparel" : {
      "sourcedFrom" : "has_apparel"
    },
    "mpopularity_tier" : {
      "sourcedFrom" : "popularity_tier"
    },
    "mprobability_apparel" : {
      "sourcedFrom" : "probability_apparel"
    },
    "mprobability_accessories" : {
      "sourcedFrom" : "probability_accessories"
    },
    "mprobability_extensions" : {
      "sourcedFrom" : "probability_extensions"
    }
  }
}

Entity name : modifiedproduct version : 0.0.1

I have tried many times to debug the issue but couldn't find where it goes wrong. As a result, the same JSON is stored in the final database without the mapped attributes being applied.

folder structure: Folder structure screenshot

json file

{
"envelope": {
"headers": {
"sources": [
{
"name": "ingestion_only-flow"
}
], 
"createdOn": "2020-07-02T09:49:57.5876177+02:00", 
"createdBy": "admin", 
"createdUsingFile": "C:\\Users\\Jhansi\\IdeaProjects\\MarklogicDataHubFramework5.2\\input\\board_games.csv"
}, 
"triples": [
], 
"instance": {
"game_id": "1000130", 
"SKU": "177897644317", 
"title": "careful crack", 
"price": "24.95", 
"description": "", 
"years_active": "0", 
"publication_date": "0", 
"players": "2-4", 
"age_range": "", 
"setup_time": "< 5 minutes", 
"playing_time": "1 hour", 
"chance": "High", 
"category": "Board Game", 
"has_extensions": "False", 
"has_accessories": "True", 
"has_apparel": "False", 
"popularity_tier": "3", 
"probability_apparel": "0.3", 
"probability_accessories": "0.3", 
"probability_extensions": "0.3"
}, 
"attachments": null
}
}
  • Yes, I have re-deployed the DH. Here is the content of the JSON. I didn't see any errors while running the steps. Initially I tried to run the flow without the steps. After some runs, I tried to run step 2 alone by using the command below: gradlew.bat hubRunFlow -PflowName=ingestionmapping -Psteps="2". I even tried deleting the ingested data in the staging database and re-running the whole flow: gradlew.bat hubRunFlow -PflowName=ingestionmapping – Jhansi Lakshmi Jul 02 '20 at 07:54
  • I am able to run the flows successfully; they don't show any errors. I have updated the main question with data and screenshots. Could you please have a look at it? – Jhansi Lakshmi Jul 02 '20 at 18:41

1 Answer

0

Data Hub renders the desired mapping when MarkLogic Entity Services is properly deployed. (Notice the entity declaration in the mapped document; that is the key takeaway from all of this.)

enter image description here

https://docs.marklogic.com/datahub//flows/flow-definition.html#flow-definition__custom-step-settings

stepDefinitionName: .....Tip: If you are customizing a default step type (ingestion, mapping, or mastering), leave the value as default-ingestion, default-mapping, or default-mastering....

Once you have reviewed the above, please follow Data Hub best practices and correct the erroneous, manually written step definitions. The settings below shouldn’t occur if you use QuickStart to create the flow and steps, given your familiarity with MarkLogic Data Hub.

"steps": {
    "1": {
……………
      "stepDefinitionName": "productIngestion",
      "stepDefinitionType": "INGESTION",
……………

    

"2": {
      "name": "mapping-step",

      "stepDefinitionName": "productMapping",
      "stepDefinitionType": "MAPPING",
…………

        "mapping": {
          "name": "ingestionmapping-productMapping",
  1. Please clean up your project structure and remove the contents of the step-definitions folder. Project structure example (the pink part):

enter image description here

  2. A working example of the step definitions is below. When in doubt, please validate the step in QuickStart.
{
  "name" : "ingestionmapping",
  "description" : "",
  "batchSize" : 100,
  "threadCount" : 4,
  "stopOnError" : false,
  "options" : { },
  "version" : 0,
  "steps" : {
    "1" : {
      "name" : "csv-ingest-step-json",
      "description" : "",
      "options" : {
        "additionalCollections" : [ ],
        "headers" : {
          "sources" : [ {
            "name" : "ingestionmapping"
          } ],
          "createdOn" : "currentDateTime",
          "createdBy" : "currentUser"
        },
        "sourceQuery" : "cts.collectionQuery([])",
        "collections" : [ "mapping-flow-ingestion-json" ],
        "permissions" : "data-hub-operator,read,data-hub-operator,update",
        "outputFormat" : "json",
        "targetDatabase" : "store-hub-STAGING"
      },
      "customHook" : {
        "module" : "",
        "parameters" : { },
        "user" : "",
        "runBefore" : false
      },
      "retryLimit" : 0,
      "batchSize" : 100,
      "threadCount" : 4,
      "stepDefinitionName" : "default-ingestion",
      "stepDefinitionType" : "INGESTION",
      "fileLocations" : {
        "inputFilePath" : "/mldhf/STORE/data/products/games",
        "inputFileType" : "csv",
        "outputURIReplacement" : ".*games*.,'/mapping-flow/json'",
        "separator" : ","
      }
    },
    "2" : {
      "name" : "mapping-step",
      "description" : "",
      "options" : {
        "additionalCollections" : [ ],
        "sourceQuery" : "cts.collectionQuery([\"mapping-flow-ingestion-json\"])",
        "mapping" : {
          "name" : "ingestionmapping-mapping-step",
          "version" : 1
        },
        "targetEntity" : "modifiedproduct",
        "sourceDatabase" : "store-hub-STAGING",
        "collections" : [ "mapping-flow-mapping-json", "mdm-content" ],
        "permissions" : "data-hub-operator,read,data-hub-operator,update",
        "validateEntity" : false,
        "sourceCollection" : "csv-ingest-step-json",
        "outputFormat" : "json",
        "targetDatabase" : "store-hub-FINAL"
      },
      "customHook" : {
        "module" : "",
        "parameters" : { },
        "user" : "",
        "runBefore" : false
      },
      "retryLimit" : null,
      "batchSize" : 100,
      "threadCount" : 4,
      "stepDefinitionName" : "entity-services-mapping",
      "stepDefinitionType" : "MAPPING"
    }
  }
}
Fiona Chen
  • 1,358
  • 5
  • 16
  • Thanks for the detailed screenshots, Chen. I followed the same steps you mentioned and it started working as well. But I have a couple of questions; could you please help me understand them? 1. After mapping, I checked each and every element, and the attachments contain the source envelope data. Is this working as expected? If so, why do we have the source envelope in the mapped data? 2. If we are using default-ingestion and mapping, then what is the use of custom steps, and when would we use them? Any examples? 3. What is the use of a custom hook? 4. Can we use custom steps if we have logic to perform while mapping the data? – Jhansi Lakshmi Jul 03 '20 at 11:04