-1

A table contains a single column of text type. The text contains one or more lines where each lines contains one or more sections created from letters. The goal is to write a query that returns a tuple for each section with its start point ,end point and value.


Data sample

create table t (txt varchar (1000));

insert into t (txt) values
(
'
AAAAAAAAAAAAAAAAAAAAAAAAAAAA  BBBB    CCCCCCCCCCCCCCCCCCCCCCCCC
DDDE  FFFFFFFF    GGGGGGGGG               HHHHHHHH    IIIIIII
JJ      KKKLLL       MM NN                              OOOOO
            P                                              QQ
'
)
;

Requested results

* Only the last 3 columns (section start/end/val) are required, the rest are for debugging purposes.

line_ind    section_ind   section_length  section_start  section_end   section_val
1           1             28              1              28            A
1           2             4               31             34            B
1           3             25              39             63            C
2           1             3               1              3             D
2           2             1               4              4             E
2           3             8               7              14            F
2           4             9               19             27            G
2           5             8               43             50            H
2           6             7               55             61            I
3           1             2               1              2             J
3           2             3               9              11            K
3           3             3               12             14            L
3           4             2               22             23            M
3           5             2               25             26            N
3           6             5               57             61            O
4           1             1               13             13            P
4           2             2               60             61            Q
Community
  • 1
  • 1
David דודו Markovitz
  • 42,900
  • 6
  • 64
  • 88

2 Answers2

0

Teradata

  • Currently regexp_split_to_table doesn't seem to support zero-length expression (I've created incident RECGZJKZV). In order to overcome this limitation I'm using regexp_replace to push space between adjacent sequences of letters, e.g. KKKLLL

with        l
            as
            (
                select      line_ind
                           ,line

                from        table
                            (
                                regexp_split_to_table (-1,t.txt,'\r','')
                                returns (minus_one int,line_ind int,line varchar(1000))
                            )   
                            as l
            )

select      l.line_ind
           ,r.section_ind                                           
           ,char_length     (r.section)                                 as section_length
           ,regexp_instr    (l.line,'(\S)\1*',1,r.section_ind,0)        as section_start
           ,regexp_instr    (l.line,'(\S)\1*',1,r.section_ind,1) - 1    as section_end
           ,substr          (r.section,1,1)                             as section_val

from        table
            (
                regexp_split_to_table (l.line_ind,regexp_replace (l.line,'(?<=(?P<c>.))(?!(?P=c))',' '),'\s+','')
                returns (line_ind int,section_ind int,section varchar(1000))
            )  
            as r
           ,l

where       l.line_ind  =
            r.line_ind

order by    l.line_ind
           ,r.section_ind   
;

Oracle

select      regexp_instr  (txt,'(\S)\1*',1,level,0)       - instr (txt,chr(10),regexp_instr (txt,'(\S)\1*',1,level,0) - length (txt) - 1,1)   as section_start
           ,regexp_instr  (txt,'(\S)\1*',1,level,1) - 1   - instr (txt,chr(10),regexp_instr (txt,'(\S)\1*',1,level,0) - length (txt) - 1,1)   as section_end
           ,regexp_substr (txt,'(\S)\1*',1,level,'',1)                                                                                        as section_val

from        t

connect by  level <= regexp_count (txt,'(\S)\1*')
;
David דודו Markovitz
  • 42,900
  • 6
  • 64
  • 88
0

Oracle

This will work even if you have multiple input rows:

WITH lines ( txt, id, line, pos, line_no ) AS(
  SELECT txt,
         id,
         REGEXP_SUBSTR( txt, '.*', 1, 1 ),
         REGEXP_INSTR( txt, '.*', 1, 1, 1 ),
         1
  FROM   t
UNION ALL
  SELECT txt,
         id,
         REGEXP_SUBSTR( txt, '.*', pos + 1, 1 ),
         REGEXP_INSTR( txt, '.*', pos + 1, 1, 1 ),
         line_no + 1
  FROM   lines
  WHERE  pos > 0
),
words ( id, line, line_no, section_start, section_end, section_value ) AS (
  SELECT id,
         line,
         line_no,
         REGEXP_INSTR( line, '(\S)\1*', 1, 1, 0 ),
         REGEXP_INSTR( line, '(\S)\1*', 1, 1, 1 ) - 1,
         REGEXP_SUBSTR( line, '(\S)\1*', 1, 1, NULL, 1 )
  FROM   lines
  WHERE  pos > 0
  AND    line IS NOT NULL
UNION ALL
  SELECT id,
         line,
         line_no,
         REGEXP_INSTR( line, '(\S)\1*', section_end + 1, 1, 0 ),
         REGEXP_INSTR( line, '(\S)\1*', section_end + 1, 1, 1 ) - 1,
         REGEXP_SUBSTR( line, '(\S)\1*', section_end + 1, 1, NULL, 1 )
  FROM   words
  WHERE  section_end > 0
)
SELECT id,
       line_no,
       section_start,
       section_end,
       section_value
FROM   words
WHERE  section_end > 0
ORDER BY id, line_no, section_start

So, for the input data (with an added id column to be able to easily differentiate the pieces of text):

create table t (id NUMBER(5,0), txt varchar (1000));

insert into t (id, txt) values
(
1,
'
AAAAAAAAAAAAAAAAAAAAAAAAAAAA  BBBB    CCCCCCCCCCCCCCCCCCCCCCCCC
DDDE  FFFFFFFF    GGGGGGGGG               HHHHHHHH    IIIIIII
JJ      KKKLLL       MM NN                              OOOOO
            P                                              QQ
'
);

insert into t (id, txt) values ( 2, 'RRRSTT UUU    V WXYZ' );

This outputs:

ID | LINE_NO | SECTION_START | SECTION_END | SECTION_VALUE
-: | ------: | ------------: | ----------: | :------------
 1 |       2 |             1 |          28 | A            
 1 |       2 |            31 |          34 | B            
 1 |       2 |            39 |          63 | C            
 1 |       3 |             1 |           3 | D            
 1 |       3 |             4 |           4 | E            
 1 |       3 |             7 |          14 | F            
 1 |       3 |            19 |          27 | G            
 1 |       3 |            43 |          50 | H            
 1 |       3 |            55 |          61 | I            
 1 |       4 |             1 |           2 | J            
 1 |       4 |             9 |          11 | K            
 1 |       4 |            12 |          14 | L            
 1 |       4 |            22 |          23 | M            
 1 |       4 |            25 |          26 | N            
 1 |       4 |            57 |          61 | O            
 1 |       5 |            13 |          13 | P            
 1 |       5 |            60 |          61 | Q            
 2 |       1 |             1 |           3 | R            
 2 |       1 |             4 |           4 | S            
 2 |       1 |             5 |           6 | T            
 2 |       1 |             8 |          10 | U            
 2 |       1 |            15 |          15 | V            
 2 |       1 |            17 |          17 | W            
 2 |       1 |            18 |          18 | X            
 2 |       1 |            19 |          19 | Y            
 2 |       1 |            20 |          20 | Z            

db<>fiddle here

MT0
  • 143,790
  • 11
  • 59
  • 117