Friday, September 23, 2016

Netezza: transpose delimited string into table rows e.g. split delimited character field into records

Related posts: 

Have this




Need this



















SOLUTION:  This will transpose delimited record to table rows

SELECT c.cnt                                                           AS POSITION_IN_STRING_ARRAY
      ,SQLX..REGEXP_EXTRACT(a.longstring,'[^|]+',1,cast(c.cnt as int)) AS GET_VALUE_EVERY_ELEMENT
FROM (
              select '201|202|203|204|205|206|207|208|209|210|211|212|213|214|215' as longstring
       ) a
cross join (select row_number () OVER (order by 1) as cnt
              from (SELECT 1 FROM HISTDB.."$hist_log_entry_3" LIMIT 70) v
           )c
WHERE GET_VALUE_EVERY_ELEMENT IS NOT NULL




EXAMPLE AND DETAILS 

This is real problem. Split pipe delimited string into individual elements: one table record for each element.



SOLUTION #1: Using  Netezza SQL Toolkit array functions  array_split () and get_value_varchar () .

Proof of concept: 

SELECT
 sqlx..array_split(STR,'|')                                 as THIS_IS_ARRAY
,sqlx..get_value_varchar(sqlx..array_split(STR,'|'), NULL as GET_NULL
,sqlx..get_value_varchar(sqlx..array_split(STR,'|'),  1  )  as GET_01  --ARR starts w. 1
,sqlx..get_value_varchar(sqlx..array_split(STR,'|'),  2  )  as GET_02
,sqlx..get_value_varchar(sqlx..array_split(STR,'|'), 15  )  as GET_15
,sqlx..array_count(sqlx..array_split(STR,'|'))              as ARRAY_COUNT
--,sqlx..get_value_varchar(sqlx..array_split(STR,'|'), 16 ) as GET_NONEXIST 
--- ERROR [HY000] ERROR:  0 : Invalid Index Id
--,sqlx..get_value_varchar(sqlx..array_split(STR,'|'), '1') as GET_STRING   
--- ERROR [HY000] ERROR:  0 : Invalid Index Name
  from (
       select '201|202|203|204|205|206|207|208|209|210|211|212|213|214|215' as STR) AS F
;

Putting this all together

select F.ROWID AS RECID                    --- Original source records PK or unique identifier.            
,c.CNT        AS POSITION_IN_STRING_ARRAY --- Position of elemen in the field array (optional).
,sqlx..get_value_varchar(ARR,CAST(DECODE(SIGN(LEN-c.CNT),1,c.CNT,null) AS INT) ) AS GET_VALUE_EVERY_ELEMENT  
FROM  --- CAST is a must otherwise GET_VALUE_VARCHAR fails with:  "ERROR: Invalid Index Name"
(select ROWID                                                          --- Get ROWID or replace it with PK column
      ,sqlx..array_split(LONGSTRING,'|')                    AS ARR   --- Parse string into array only once
      ,sqlx..array_count(sqlx..array_split(LONGSTRING,'|')) AS LEN   --- Get lenth of this field to be used later
  from TESTDB.LONGSTRING
) AS --- Generate sequence from 1..N. Where N is the longest array in the field CNTRCT_SPEED
cross join (select row_number () OVER (order by 1) as cnt             
                 FROM (SELECT 1 FROM TESTDB.LONGSTRING limit 70 ) v
) AS C --- Eleminate non-existing array elements. They still in Cartesian Product (1..70) where there is no match.
where DECODE(SIGN(LEN-c.CNT),1,c.CNT,null) is not null
;




SOLUTION #2: Using  Netezza SQL Toolkit regular expression function regexp_extract () .

Proof of concept: 

 SELECT
 sqlx..regexp_match_count(STR,'[^|]+')      as MATCH_COUNT
,sqlx..regexp_extract(STR,'[^|]+',1,NULL)   as GET_NULL
,sqlx..regexp_extract(STR,'[^|]+',1,1)      as GET_01
,sqlx..regexp_extract(STR,'[^|]+',1,2)       as GET_02
,sqlx..regexp_extract(STR,'[^|]+',1,15)     as GET_15
,sqlx..regexp_extract(STR,'[^|]+',1,16)     as GET_GET_16
,sqlx..regexp_extract(STR,'[^|]+',1,222)    as GET_NONEXIST
---,sqlx..regexp_extract(STR,'[^|]+',1,0)   as GET_ZERO   
--- ERROR [HY000] ERROR:  0 : Regex: Illegal Reference Value 0
---,sqlx..regexp_extract(STR,'[^|]+',1,'1') as GET_STRING 
--- ERROR [HY000] ERROR:  0 : Regex: Illegal Option (1)
  from (
       select '201|202|203|204|205|206|207|208|209|210|211|212|213|214|215' as STR) AS F
;


Putting this all together
select a.ROWID AS RECID                  --- Original source records PK or unique identifier. 
      ,c.cnt AS POSITION_IN_STRING_ARRAY --- Position of elemen in the field array (optional).
      ,sqlx..regexp_extract(LONGSTRING,'[^|]+',1,cast(c.cnt as int)) GET_VALUE_EVERY_ELEMENT
from (select ROWID, * from TESTDB.LONGSTRING
) a --- Generate sequence from 1..N. Where N is the longest array in the field CNTRCT_SPEED
cross join (select row_number () OVER (order by 1) as cnt    
                 FROM (SELECT 1 FROM TESTDB.LONGSTRING  limit 70 ) v ) c
--- Eliminate non-existing elements. They still listed in Cartesian Product (1..70) where there is no match.
WHERE GET_VALUE_EVERY_ELEMENT IS NOT NULL
;




PERFORMANCE COMPARISON:

Notice,  limiting depth of search in crossjoin  ... limit70 will reduce the data explosion and improve performance.  This is tested at TF6 with 14 million record original source table.



SOLUTION #1array_split ()  and get_value_varchar () .
/*  LIMIT 5,000,000 Duration 19.1540956 sec. Records Affected:   6,605,274. */
/* LIMIT 10,000,000 Duration 52.2369878 sec. Records Affected:  43,817,113. */
/* LIMIT 12,000,000 Duration 45.5606059 sec. Records Affected:  79,234,929.*/
/* LIMIT 13,000,000 Duration 49.7078431 sec. Records Affected:  98,032,515. */
/* LIMIT 14,000,000 Duration 50.1268671 sec. Records Affected: 106,848,663.*/


SOLUTION #2:  regexp_extract () .
/* LIMIT  5,000,000 Duration  131.9048611 sec. Records Affected:    6,605,274. */
/* LIMIT 10,000,000 Duration  683.4229685 sec. Records Affected:   43,817,113. */
/* LIMIT 12,000,000 Duration 1175.0658919 sec. Records Affected:   79,234,929. */
/* LIMIT 13,000,000 Duration 1433.8532871 sec. Records Affected:   98,032,515. */
/* LIMIT 14,000,000 Duration 1562.2020755 sec. Records Affected:  106,848,663. */


Netezza: search for string in all database tables. Parse all text fields.

PROBLEM

Search every field in the database searching for a string pattern.

SOLUTION
This will generate long SQL statement what will include all existing text columns for all tables within confines of the CURRENT_CATALOG database. 

select 'union all select '''||CURRENT_CATALOG||''' DB,'''||NAME||''' TAB,'''||ATTNAME||''' COL'
||',sum(sign(nvl(instr('||ATTNAME||',''My-String#1-Here''),0))) CNTROWS_STR1 '
||',sum(sign(nvl(instr('||ATTNAME||',''My-String#2-Here''),0))) CNTROWS_STR2 '
||',sum(sign(nvl(instr('||ATTNAME||',''My-String#N-Here''),0))) CNTROWS_STRN '
||' from '||CURRENT_CATALOG||'..'||NAME
from _v_relation_column
where OBJCLASS = 4905 --- TABLE
  and ATTTYPID in (
                 18   --- CHAR
                ,19   --- NAME
                ,25   --- TEXT
               ,1042  --- CHARACTER
               ,1043  --- CHARACTER VARYING
               ,2530  --- NATIONAL CHARACTER VARYING
               ,2522  --- NATIONAL CHARACTER
) ORDER BY 1

;



EXAMPLE

 SELECT * FROM (
          select 'NZDBA' DB,'NZ_DB_SIZE_HISTORY' TAB,'DATABASE' COL
,sum(sign(nvl(instr(DATABASE,'A'),0))) CNTROWS_STR1
,sum(sign(nvl(instr(DATABASE,'B'),0))) CNTROWS_STR2
,sum(sign(nvl(instr(DATABASE,'C'),0))) CNTROWS_STRN
 from NZDBA..NZ_DB_SIZE_HISTORY
union all select 'NZDBA' DB,'NZ_DB_SIZE_HISTORY' TAB,'NAME' COL
,sum(sign(nvl(instr(NAME,'A'),0))) CNTROWS_STR1
,sum(sign(nvl(instr(NAME,'B'),0))) CNTROWS_STR2
,sum(sign(nvl(instr(NAME,'C'),0))) CNTROWS_STRN
 from NZDBA..NZ_DB_SIZE_HISTORY
union all select 'NZDBA' DB,'NZ_DB_SIZE_HISTORY' TAB,'OBJCLASS' COL
,sum(sign(nvl(instr(OBJCLASS,'A'),0))) CNTROWS_STR1
,sum(sign(nvl(instr(OBJCLASS,'B'),0))) CNTROWS_STR2
,sum(sign(nvl(instr(OBJCLASS,'C'),0))) CNTROWS_STRN
 from NZDBA..NZ_DB_SIZE_HISTORY
union all select 'NZDBA' DB,'NZ_DB_SIZE_HISTORY' TAB,'SCHEMA' COL
,sum(sign(nvl(instr(SCHEMA,'A'),0))) CNTROWS_STR1
,sum(sign(nvl(instr(SCHEMA,'B'),0))) CNTROWS_STR2
,sum(sign(nvl(instr(SCHEMA,'C'),0))) CNTROWS_STRN
 from NZDBA..NZ_DB_SIZE_HISTORY
union all select 'NZDBA' DB,'NZ_GRA_HISTORY' TAB,'GROUPNAME' COL
,sum(sign(nvl(instr(GROUPNAME,'A'),0))) CNTROWS_STR1
,sum(sign(nvl(instr(GROUPNAME,'B'),0))) CNTROWS_STR2
,sum(sign(nvl(instr(GROUPNAME,'C'),0))) CNTROWS_STRN
 from NZDBA..NZ_GRA_HISTORY
 --- ... many more here ... ---
) as x
WHERE (CNTROWS_STR1+CNTROWS_STR2+CNTROWS_STR2) > 0  --- Filter added
ORDER BY 1,2,3                                      --- to discard no matched
;