Tuesday, March 27, 2018

Hive: converting Netezza datatypes to Hive datatypes




PROBLEM
Dynamically create target Hive tables and load them with data exported from Netezza. 

SOLUTION
To create Hive tables dynamically you need to convert Netezza datatypes to Hive datatypes, then proceed with the load. Here is a query that generates the datatype conversion mapping:

SELECT
DISTINCT
FORMAT_TYPE as NZ_ORIG_DATATYPE,
/* note: the nesting order matters, e.g. 'NUMERIC(38,0)' is matched before the generic 'NUMERIC' rule */
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(
SQLX..REPLACE(FORMAT_TYPE
,'CHARACTER VARYING' ,'VARCHAR'        )
,'NATIONAL VARCHAR'  ,'VARCHAR'        )
,'REAL'              ,'FLOAT'          )
,'NUMERIC(38,0)'     ,'BIGINT'         )
,'NATIONAL CHARACTER','CHAR'           )
,'INTEGER'           ,'INT'            )
,'NUMERIC'           ,'DECIMAL'        )
,'BYTEINT'           ,'TINYINT'        )
,'DOUBLE PRECISION'  ,'DECIMAL(14,10)' ) as HIVE_DATATYPE
FROM _v_relation_column
WHERE OBJCLASS=4905;

How to identify the maximum precision and scale of fractional numeric datatypes

PROBLEM:
This is to prevent the loss of fractional precision during the data transfer into the target table, which would otherwise happen implicitly, e.g. the precision of a decimal value being silently reduced to fit the source value into the target field.
When migrating data from a source system to another target database, we need to identify the precision (p) and scale (s) of DOUBLE PRECISION, REAL, or FLOAT datatypes in order to create the target table with an explicitly stated DECIMAL(p,s) datatype.

SOLUTION
In this case we are migrating data from Netezza NPS 7.2 to Hortonworks Hive 1.2. The main idea is to convert the fractional part of a number to a string and count its length:

select
  max(
    length(
        trim(
          trim(
             to_char(
                  DECIMAL_COL01-int8(DECIMAL_COL01)  /* <-        chop off the integer part:    1234.56789 - 1234 = .56789                   */
             ,'9.999999999999999999999999999999')    /* <-  format as char, padded with zeros:  .56789 -> 0.567890000000000000000000000000   */
            )                                        /* <-               trim blank spaces                                                   */
      ,'0')                                          /* <-    trim leading and trailing '0':    .56789                                       */
   )-1)                                              /* <-  length of the string minus the '.'                                               */
as MAX01 from TESTDB..MYNUMERICS;
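
A quick way to sanity-check the expression is to run it against a literal with a known fractional part (a value below .5 avoids any rounding ambiguity in the int8() cast):

/* Worked check: 1234.23 has 2 fractional digits, so expect FRAC_LEN = 2 */
select
  length(trim(trim(to_char(
      1234.23 - int8(1234.23)
  ,'9.999999999999999999999999999999')),'0')) - 1 as FRAC_LEN;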


PUTTING THIS ALL TOGETHER:
/* Source table definition */
CREATE TABLE TESTDB..MYNUMERICS
(
       DECIMAL_COL01 DOUBLE PRECISION,
       DECIMAL_COL02 DOUBLE PRECISION,
       DECIMAL_COL03 DOUBLE PRECISION,
       DECIMAL_COL04 DOUBLE PRECISION,
       DECIMAL_COL05 DOUBLE PRECISION,
       DECIMAL_COL06 DOUBLE PRECISION,
       DECIMAL_COL07 DOUBLE PRECISION,
       DECIMAL_COL08 DOUBLE PRECISION,
       DECIMAL_COL09 DOUBLE PRECISION,
       DECIMAL_COL10 DOUBLE PRECISION,
       DECIMAL_COL11 DOUBLE PRECISION,
       DECIMAL_COL12 DOUBLE PRECISION
) DISTRIBUTE ON RANDOM;

SELECT * FROM TESTDB..MYNUMERICS ORDER BY RANDOM() LIMIT 1000;



Get maximum scale (s) for all fractional columns:
/* Get the length of the FRACTIONAL part, i.e. how many digits of precision after the decimal point */
select
  max(length(trim(trim(to_char(DECIMAL_COL01-int8(DECIMAL_COL01),'9.999999999999999999999999999999')),'0'))-1) as MAX01 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL02-int8(DECIMAL_COL02),'9.999999999999999999999999999999')),'0'))-1) as MAX02 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL03-int8(DECIMAL_COL03),'9.999999999999999999999999999999')),'0'))-1) as MAX03 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL04-int8(DECIMAL_COL04),'9.999999999999999999999999999999')),'0'))-1) as MAX04 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL05-int8(DECIMAL_COL05),'9.999999999999999999999999999999')),'0'))-1) as MAX05 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL06-int8(DECIMAL_COL06),'9.999999999999999999999999999999')),'0'))-1) as MAX06 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL07-int8(DECIMAL_COL07),'9.999999999999999999999999999999')),'0'))-1) as MAX07 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL08-int8(DECIMAL_COL08),'9.999999999999999999999999999999')),'0'))-1) as MAX08 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL09-int8(DECIMAL_COL09),'9.999999999999999999999999999999')),'0'))-1) as MAX09 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL10-int8(DECIMAL_COL10),'9.999999999999999999999999999999')),'0'))-1) as MAX10 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL11-int8(DECIMAL_COL11),'9.999999999999999999999999999999')),'0'))-1) as MAX11 --- DOUBLE PRECISION,
,max(length(trim(trim(to_char(DECIMAL_COL12-int8(DECIMAL_COL12),'9.999999999999999999999999999999')),'0'))-1) as MAX12 --- DOUBLE PRECISION,
from TESTDB..MYNUMERICS;
Get the maximum length of the whole-number part for each column in this data set; the precision (p) is this length plus the scale (s).
/* Get the length of the WHOLE NUMBER part */
select                                           
 max(length(trim(to_char(int8(DECIMAL_COL01),'999999999999999999999999999999')))) as MAX01 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL02),'999999999999999999999999999999')))) as MAX02 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL03),'999999999999999999999999999999')))) as MAX03 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL04),'999999999999999999999999999999')))) as MAX04 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL05),'999999999999999999999999999999')))) as MAX05 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL06),'999999999999999999999999999999')))) as MAX06 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL07),'999999999999999999999999999999')))) as MAX07 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL08),'999999999999999999999999999999')))) as MAX08 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL09),'999999999999999999999999999999')))) as MAX09 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL10),'999999999999999999999999999999')))) as MAX10 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL11),'999999999999999999999999999999')))) as MAX11 --- DOUBLE PRECISION,
,max(length(trim(to_char(int8(DECIMAL_COL12),'999999999999999999999999999999')))) as MAX12 --- DOUBLE PRECISION,
from TESTDB..MYNUMERICS;

Outcome
With a maximum whole-number length of 2 digits and a maximum scale of 10, the DOUBLE PRECISION columns in this data set are safe to declare as DECIMAL(12,10) in the target Hive table, which guarantees no implicit data loss during the load.
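
For completeness, the target Hive DDL implied by this outcome might look as follows (the storage clause is an assumption, not from the original post):

-- Target Hive table; DECIMAL(12,10) preserves every digit observed in the source
CREATE TABLE MYNUMERICS
(
  DECIMAL_COL01 DECIMAL(12,10),
  DECIMAL_COL02 DECIMAL(12,10),
  -- ... DECIMAL_COL03 through DECIMAL_COL11 declared the same way ...
  DECIMAL_COL12 DECIMAL(12,10)
)
STORED AS ORC;  -- storage format is an assumption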



Wednesday, March 14, 2018

Practical network speed test: 1 Gbit vs. 10 Gbit with 1500 MTU vs. 9000 MTU

QUESTION:
How much practical network throughput is gained by moving from the standard 1500 MTU to a 9000 MTU (aka jumbo frames) on a 10 Gbit network?

ANSWER
The empirical test shows a ~3x improvement when transferring data over a 10 Gbit versus a 1 Gbit network, both at the standard 1500 MTU. Changing the 10 Gbit network interface to a 9000 MTU gains an additional ~3x improvement over the standard 1500 MTU.



The speed test was conducted under different network conditions over several days of running the iperf3 command.

LOUD AND CLEAR:

Data-heavy environments where large amounts of data move across the network should expect a ~3x improvement from cranking a 10 Gbit interface up to a 9000 MTU. Target applications are Big Data systems, database backups, data warehouses, and large-scale data analytics. Technically, the MTU here is set to 8912 rather than 9000 to account for 88 bytes of overhead (42 + 46).

Useful commands:


/sbin/ifconfig bond3 mtu 1500 up   ### Baseline run at the standard MTU
/opt/network-tools/iperf3 -s       ### Start iperf3 in server (listener) mode
/sbin/ifconfig bond3 mtu 8912 up   ### Set up the jumbo-frame MTU here
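
To confirm that jumbo frames actually pass end-to-end rather than being silently fragmented, a non-fragmenting ping is a quick check. With an 8912 MTU the largest unfragmented ICMP payload is 8912 - 28 = 8884 bytes (20-byte IP header + 8-byte ICMP header); the hostname below is a placeholder:

ping -M do -s 8884 -c 3 remote-host   ### succeeds only if the whole path supports the jumbo MTU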


Make the setting permanent in the interface startup script:
 
[root@serverX~]# cat /etc/sysconfig/network-scripts/ifcfg-bond0
DEVICE="bond0"
BOOTPROTO=none
IPADDR=10.20.30.99
NETMASK=255.255.255.192
ONBOOT="yes"
USERCTL=no
TXQUEUELEN=100000
MTU=8912
BONDING_OPTS="mode=1 miimon=200 primary=eth0"
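
To apply the permanent setting without a reboot (assuming RHEL-style network scripts, as the ifcfg file above suggests):

/sbin/ifdown bond0 && /sbin/ifup bond0   ### re-read ifcfg-bond0 and bring the interface back up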






Tuesday, March 13, 2018

Netezza: create local synonyms for the SQL Extensions Toolkit

PROBLEM
Improve SQL and NZPLSQL code by eliminating fully qualified references to SQL Toolkit functions installed in custom-named locations: <database>.<schema>.<functionname>. Instead, call the local <functionname> in the context of the current database to execute these functions and aggregates even though they are installed in a separate database. This makes code standard, portable, and convenient. Consider it a best practice that will save you a lot of hassle in the long run.

SOLUTION:
Create database-local synonyms that point to the functions and aggregates installed in a separate database.
Benefits of this best-practice method:
  • keep only one version of the SQL Toolkit installed per appliance, i.e. install a SQL Toolkit version once rather than in each database.
  • toggle between different versions of the SQL Toolkit by rebuilding the synonyms to point at the version you need.
  • keep SQL/NZPLSQL code portable and standard by keeping it clean of custom database names; only local, standard function calls remain.

/* Generate SQL commands to create synonyms */
set catalog SQL_EXTENSIONS;
select * from (
       select distinct 'create synonym '||function ||' for '||current_catalog||'.ADMIN.'||function ||';' from _v_function  where database = current_catalog union all
       select distinct 'create synonym '||aggregate||' for '||current_catalog||'.ADMIN.'||aggregate||';' from _v_aggregate where database = current_catalog union all
       select distinct 'alter synonym '||function ||' owner to ADMIN;'            from _v_function  where database = current_catalog union all
       select distinct 'alter synonym '||aggregate||' owner to ADMIN;'            from _v_aggregate where database = current_catalog
) as F
order by 1 desc;


/* Run the newly generated list of commands in the target database  */
SET CATALOG MY_TARGET_DB;  --- Set the target DB here, where you want the synonyms created ---
create synonym YEAR for SQL_EXTENSIONS.ADMIN.YEAR;
create synonym XMLUPDATE for SQL_EXTENSIONS.ADMIN.XMLUPDATE;
create synonym XMLSERIALIZE for SQL_EXTENSIONS.ADMIN.XMLSERIALIZE;
create synonym XMLROOT for SQL_EXTENSIONS.ADMIN.XMLROOT;
create synonym XMLPARSE for SQL_EXTENSIONS.ADMIN.XMLPARSE;
create synonym XMLEXTRACTVALUE for SQL_EXTENSIONS.ADMIN.XMLEXTRACTVALUE;
create synonym XMLEXTRACT for SQL_EXTENSIONS.ADMIN.XMLEXTRACT;
create synonym XMLEXISTSNODE for SQL_EXTENSIONS.ADMIN.XMLEXISTSNODE;
create synonym XMLELEMENT for SQL_EXTENSIONS.ADMIN.XMLELEMENT;
create synonym XMLCONCAT for SQL_EXTENSIONS.ADMIN.XMLCONCAT;
     ..... 99 create + 99 alter statements ....  
alter synonym ARRAY_COUNT owner to ADMIN;
alter synonym ARRAY_CONCAT owner to ADMIN;
alter synonym ARRAY_COMBINE owner to ADMIN;
alter synonym ARRAY owner to ADMIN;
alter synonym ADD_ELEMENT owner to ADMIN;


/* Rendered list of 198 SQL commands (99 create + 99 alter). Run them all in the target DB */


After running the generated code you should end up with exactly 99 synonyms (as of 2018-03-13, SQL Toolkit v7.2.1.3).
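
A quick sanity check, assuming the _v_synonym system view (not shown in the original post):

/* Count the synonyms created in the current database; expect 99.
   Assumes _v_synonym exposes a DATABASE column like the other _v views. */
SELECT COUNT(*) FROM _v_synonym WHERE DATABASE = CURRENT_CATALOG;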
