3

Hi I have following tables:

create table Features
(
FeatureId bigint,
FeatureName varchar(255),
ParentId bigint
)

insert into Features values(10, 'Feature 1', 1);
insert into Features values(11, 'Feature 2', 10);
insert into Features values(12, 'Feature 3', 11);
insert into Features values(13, 'Feature 4', 2);
insert into Features values(14, 'Feature 5', 13);

insert into Features values(15, 'Feature 6', 3);
insert into Features values(16, 'Feature 7', 15);
insert into Features values(17, 'Feature 8', 16);
insert into Features values(18, 'Feature 9', 17);
insert into Features values(19, 'Feature 10', 18);
insert into Features values(20, 'Feature 11', 19);
insert into Features values(21, 'Feature 12', 12);


create table Scenarios
(
ScenarioId bigint,
ParentId bigint,
ScenarioTitle varchar(25)
)

insert into Scenarios values(1, 0, 'Scenario 1')
insert into Scenarios values(2, 0, 'Scenario 2')
insert into Scenarios values(3, 0, 'Scenario 3')

Here, a feature can have either another feature as parent or a scenario as parent. For scenario, parent id can either be 0, or another scenario.

I would like to get path of each feature as follows:

FeatureId   ParentId    FeatureName PathString          PathLength
10          1           Feature 1   1                   0
11          10          Feature 2   1/10                1
12          11          Feature 3   1/10/11             2
13          2           Feature 4   2                   0
14          13          Feature 5   2/13                1
15          3           Feature 6   3                   0
16          15          Feature 7   3/15                1
17          16          Feature 8   3/15/16             2
18          17          Feature 9   3/15/16/17          3
19          18          Feature 10  3/15/16/17/18       4
20          19          Feature 11  3/15/16/17/18/19    5
21          12          Feature 12  1/10/11/12          3

Since I would like to collect this result in a temp table for further processing, I tried select into and Azure SQL DW throws Using SELECT INTO statement is not supported in Parallel Data Warehouse. Modify the statement and re-try executing it.

Here is my query (may not be in great shape as I am still figuring out recursive sql)

drop table FeaturesWithPath;
;WITH FeaturePaths (FeatureId, ParentId, FeatureName, PathString)
AS
(
-- Anchor member definition
    SELECT g.FeatureId, g.ParentId, g.FeatureName, cast(CAST(g.ParentId as nvarchar(max)) as varchar(max)) as PathString
    FROM dbo.Features AS g
    UNION ALL
-- Recursive member definition
    SELECT g.FeatureId, g.ParentId, g.FeatureName, PathString + '/' + cast(g.ParentId as varchar(max))
    FROM dbo.Features AS g
    INNER JOIN FeaturePaths AS gp
        ON g.ParentId = gp.FeatureId
)

SELECT FeatureId, ParentId, FeatureName, PathString into FeaturesWithPath FROM FeaturePaths;
--select * from FeaturesWithPath order by FeatureId;

drop table FeaturesWithPathLength;
select *, LEN(PathString) - LEN(REPLACE(PathString, '/', '')) as PathLength into FeaturesWithPathLength from FeaturesWithPath
--select * from FeaturesWithPathLength order by FeatureId

drop table MaxFeaturePathLenghtRowTable;

select * into MaxFeaturePathLenghtRowTable
from FeaturesWithPathLength
where PathLength = (select max(PathLength) from FeaturesWithPathLength as f where f.FeatureId = FeaturesWithPathLength.FeatureId)
or PathLength = (select max(PathLength) from FeaturesWithPathLength as f where f.FeatureId = FeaturesWithPathLength.FeatureId
    and PathLength > (select max(PathLength) from FeaturesWithPathLength as f2 where f2.FeatureId = FeaturesWithPathLength.FeatureId));

--select * from MaxFeaturePathLenghtRowTable order by FeatureId

drop table FeaturesPerParentTable
select FeatureId, [value] as NewParentId, FeatureName, COALESCE(NULLIF(SUBSTRING(PathString, 0, CHARINDEX('/', PathString)), ''), [value]) AS ScenarioId into FeaturesPerParentTable
    from MaxFeaturePathLenghtRowTable 
    cross apply STRING_SPLIT (PathString, '/') cs order by FeatureId

select * from FeaturesPerParentTable order by FeatureId;

I tried to convert the CTE to use CTAS which did not work either.

This is how I tried CTAS:

;WITH FeaturePaths (FeatureId, ParentId, FeatureName, PathString)
AS
(
-- Anchor member definition
    SELECT g.FeatureId, g.ParentId, g.FeatureName, cast(CAST(g.ParentId as nvarchar(max)) as varchar(max)) as PathString
    FROM dbo.Features AS g
    --WHERE parentId=0
    UNION ALL
-- Recursive member definition
    SELECT g.FeatureId, g.ParentId, g.FeatureName, PathString + '/' + cast(g.ParentId as varchar(max))
    FROM dbo.Features AS g
    INNER JOIN FeaturePaths AS gp
        ON g.ParentId = gp.FeatureId
)

CREATE TABLE #tmp_fct
WITH
(
    DISTRIBUTION = ROUND_ROBIN
)
AS
SELECT  FeatureId, ParentId, FeatureName, PathString
FROM    FeaturePaths;

Now I am wondering if there is a way to get path for each Feature on Azure SQL DW and collect result in to a table.

-- UPDATE --

For solution in SQL see this

Here is solution in C#

void Main()
{
    var scenarios = new List<Scenario> {
        new Scenario{Id = 1, Title = "Scenario 1", ParentId = 0},
        new Scenario{Id = 2, Title = "Scenario 2", ParentId = 0},
        new Scenario{Id = 3, Title = "Scenario 3", ParentId = 0},
    };

    var features = new List<Feature> {
            new Feature{Id =10, Title = "Feature 1", ParentId =1},
            new Feature{Id =11, Title = "Feature 2", ParentId =10},
            new Feature{Id =12, Title = "Feature 3", ParentId =11},
            new Feature{Id =13, Title = "Feature 4", ParentId =2},
            new Feature{Id =14, Title = "Feature 5", ParentId =13},

            new Feature{Id =15, Title = "Feature 6", ParentId =3},
            new Feature{Id =16, Title = "Feature 7", ParentId =15},
            new Feature{Id =17, Title = "Feature 8", ParentId =16},
            new Feature{Id =18, Title = "Feature 9", ParentId =17},
            new Feature{Id =19, Title = "Feature 10", ParentId =18},
            new Feature{Id =20, Title = "Feature 11", ParentId =19},
            new Feature{Id =21, Title = "Feature 12", ParentId =12}
        };

    var scenarioIds = new HashSet<long>(scenarios.Select(x => x.Id));

    //get path
    IList<Feature> withPath = features.Select(x => { x.Path = GetPath(x, features, scenarioIds); return x; }).ToList().Dump("With path");
}

private string GetPath(Feature f, IList<Feature> features, HashSet<long> scenarioIds)
{
    if (scenarioIds.Contains(f.ParentId))
    {
        return f.ParentId.ToString();
    }
    else
    {
        var parent = features.First(d => d.Id == f.ParentId);
        return GetPath(parent, features, scenarioIds) + "/" + f.ParentId;
    }
}

public class Scenario
{
    public long Id { get; set; }
    public string Title { get; set; }
    public long ParentId { get; set; }
}

public class Feature
{
    public long Id { get; set; }
    public string Title { get; set; }
    public long ParentId { get; set; }
    public string Path { get; set; } //temp
}
Community
  • 1
  • 1
Amit
  • 25,106
  • 25
  • 75
  • 116
  • Instead of the construct `SELECT ... INTO ...` create the temporary table beforehand. So `CREATE TABLE ...; INSERT INTO (...) SELECT ...`. PS: Use real temporary tables, ie tablenames starting with `#`. Eg `CREATE TABLE #MaxFeaturePathLenghtRowTable(...)`. – TT. Oct 26 '16 at 07:16
  • @TT. It throws `Common table expressions followed by INSERT, UPDATE, DELETE or MERGE are not supported in this version` If I try Insert Into. `Select Into` is the only working statement. – Amit Oct 26 '16 at 07:27
  • Ah yes, Azure and its silly restrictions. Apologies, I was fixed on the sql-server & sql tags... – TT. Oct 26 '16 at 07:38
  • Can you provide an example of what you tried with CTAS in your question, and the error that it produced? Looking at [Features and Limitations of Common Table Expressions in SQL Data Warehouse and Parallel Data Warehouse](https://msdn.microsoft.com/en-us/library/ms175972.aspx#Anchor_5) that at least should be supported in Azure: `A CTE can be specified in a CREATE TABLE AS SELECT (CTAS) statement.` – TT. Oct 26 '16 at 07:46
  • Oh scratch that last remark; from the same link: `A common table expression that includes references to itself (a recursive common table expression) is not supported.`. You will have to resort to a solution that uses an approach with CURSOR's... – TT. Oct 26 '16 at 07:48
  • I am wondering if I should just write this up in LINQ and dump result to a table for further processing – Amit Oct 26 '16 at 07:52
  • That is always a possibility. An approach with one or more CURSORs would at least keep the data inside the database layer so you don't have to transport it to another process or computer (would involve network traffic). All depends on your architecture and your requirements. – TT. Oct 26 '16 at 08:05
  • I'm saying CURSORs, but I mean any way that doesn't use recursive CTEs and that is supported in Azure. My personal preference is always to keep bulk data in the database layer and algorithms involving a lot of data (usually) benefit from that. But with all of Azure's restrictions on T-SQL, it might be a lot easier to just write it up in another layer entirely. YMMV. – TT. Oct 26 '16 at 08:15
  • @TT Cursors are not supported in Azure SQL Data Warehouse either. You will need to use a more traditional loop. I will come back to you in a few hours with an example. – wBob Oct 26 '16 at 08:38
  • @wBob Ah yes, supported in regular Azure but not in warehouse or warehouse parallel version (https://msdn.microsoft.com/en-us/library/ms180169.aspx). Looks like the OP will have to do the implementation of his problem in another layer. Painful... – TT. Oct 26 '16 at 09:04
  • No @TT, this is possible with a simple loop. I'll work up an example but day-job in the way at the moment : ) – wBob Oct 26 '16 at 10:11
  • Thanks for looking into guys, I could do it in c# with little recursion and linq and took only 10 mins..! @wBob, if could find a solution, please do post. – Amit Oct 26 '16 at 10:13

2 Answers2

6

As Azure SQL Data Warehouse does not support recursive CTEs or cursors at this time, you could do this with a good old-fashioned loop, eg:

-- Loop thru Features
DECLARE @counter INT = 1;

-- Insert first record where no parent exists
IF OBJECT_ID('tempdb..#features') IS NOT NULL DROP TABLE #features;

CREATE TABLE #features
WITH
    (
    DISTRIBUTION = HASH ( FeatureId ),
    LOCATION = USER_DB
    )
AS
WITH cte AS
(
SELECT 1 AS xlevel, p.FeatureId, p.ParentId, p.FeatureName, CAST( p.ParentId AS VARCHAR(255) ) AS PathString, 0 AS PathLength
FROM dbo.Features p
WHERE NOT EXISTS 
    (
    SELECT *
    FROM dbo.Features c
    WHERE p.ParentId = c.FeatureId
    )
)
SELECT *
FROM cte;


SELECT 'before' s, * FROM #features ORDER BY FeatureId;

-- Loop recursively through the child records
WHILE EXISTS (
SELECT *
    FROM #features p
        INNER JOIN dbo.features c ON p.FeatureId = c.ParentId
    WHERE p.xlevel = @counter
    )
BEGIN

    -- Insert next level
    INSERT INTO #features ( xlevel, FeatureId, ParentId, FeatureName, PathString, PathLength )
    SELECT @counter + 1 AS xlevel, c.FeatureId, c.ParentId, c.FeatureName, p.PathString + '/' + CAST( c.ParentId AS VARCHAR(255) ) AS PathString, @counter AS PathLength
    FROM #features p
        INNER JOIN dbo.features c ON p.FeatureId = c.ParentId
    WHERE p.xlevel = @counter;

    SET @counter += 1;

    -- Loop safety
    IF @counter > 99
    BEGIN 
        RAISERROR( 'Too many loops!', 16, 1 ) 
        BREAK 
    END;

END


SELECT 'after' s, *  FROM #features ORDER BY FeatureId;

Full code including setup is available here.

My results: My results

Hope that helps.

wBob
  • 13,710
  • 3
  • 20
  • 37
  • Thanks @wBob this is perfect – Amit Oct 26 '16 at 17:48
  • I noticed it runs super slow, may be because of too many loops. – Amit Oct 26 '16 at 17:57
  • ok, it runs in 3 seconds on my ADW. Do you have a bigger dataset? We could look at tuning it, for example my initial version was 'while @counter < 5' so that would be more efficient if you knew roughly how many levels you had. – wBob Oct 26 '16 at 18:03
  • same data as above, running for Visual Studio, takes exactly 30 seconds! My actual data will be around 100K rows. – Amit Oct 26 '16 at 18:45
  • What DWU? To be fair I was only running at 200, the dataset is so small. – wBob Oct 26 '16 at 19:04
  • Also, what's the spec of the tables? Distribution and Partitioning? I used HEAP and distributed on Hash FeatureId. – wBob Oct 26 '16 at 19:10
1

Why not create the FeaturesWithPath table beforehand and insert into it using the following pseudocode?


CREATE TABLE FeaturesWithPath (FeatureId type, ParentId type, FeatureName type, PathString type)

;with FeaturePaths (FeatureId, ParentId, FeatureName, PathString)
AS
(
-- Anchor member definition
    SELECT g.FeatureId, g.ParentId, g.FeatureName, cast(CAST(g.ParentId as nvarchar(max)) as varchar(max)) as PathString
    FROM dbo.Features AS g
    UNION ALL
-- Recursive member definition
    SELECT g.FeatureId, g.ParentId, g.FeatureName, PathString + '/' + cast(g.ParentId as varchar(max))
    FROM dbo.Features AS g
    INNER JOIN FeaturePaths AS gp
        ON g.ParentId = gp.FeatureId
)
insert FeaturesWithPath (FeatureId, ParentId, FeatureName, PathString)
SELECT FeatureId, ParentId, FeatureName, PathString FROM FeaturePaths;

knockout
  • 139
  • 4
  • I get this exception: `Common table expressions followed by INSERT, UPDATE, DELETE or MERGE are not supported in this version.` – Amit Oct 26 '16 at 07:23
  • Amit, see example F on [this page](https://msdn.microsoft.com/en-us/library/ms175972.aspx). Seems like it will give you something similar to what you are looking for without needing an insert statement. – knockout Oct 26 '16 at 07:35
  • I should still collect result in to another table for further processing. The example result I shared above is just beginning of the problem I am solving. – Amit Oct 26 '16 at 07:38
  • I've never used PDW before, but if it supports while loops, you can use that instead of a recursive cte ` while (there isn't a row that hasn't been mapped to its ancestor) begin update feature set pathstring = pathstring + '/' + parent where pathstring isnt fully populated end ` – knockout Oct 26 '16 at 08:12