DBA Data[Home] [Help]

PACKAGE: SYS.DBMS_DATA_MINING_TRANSFORM

Source


1 PACKAGE dbms_data_mining_transform AUTHID CURRENT_USER AS
2   -----------------------------------------------------------------------------
3   -- COLUMN LIST collection type
4   -----------------------------------------------------------------------------
5   -- This type is used to store both quoted and non-quoted identifiers for
6   -- column names of a table.
7   TYPE
8     COLUMN_LIST                    IS VARRAY(1000) OF VARCHAR2(32);
9 
10   -----------------------------------------------------------------------------
11   -- DESCribe LIST collection TYPE
12   -----------------------------------------------------------------------------
13   -- This type holds the results of describe operations. Note that col_name
14   -- field of DESC_TAB2 is VARCHAR2(32767). This is applicable to cases when a
15   -- SELECT expression is not given an alias and thus the expression itself is
16   -- used as name (with whitespaces removed) and thus can be larger than 30
17   -- bytes. However when those names get larger than 30 bytes the corresponding
18   -- SELECT cannot be used as an inline view.
19   SUBTYPE
20     DESCRIBE_LIST                  IS dbms_sql.DESC_TAB2;
21 
22   SUBTYPE
23     DESCRIBE_LIST3                 IS dbms_sql.DESC_TAB3;
24 
25   -----------------------------------------------------------------------------
26   -- EXPRESSION RECord type
27   -----------------------------------------------------------------------------
28   -- This type is used for storing transformation expressions. Unlike VARCHAR2
29   -- it can be used for expression that can grow larger than 32K. Field lb is
30   -- the smallest index and ub is the largest index of the elements of the
31   -- VARCHAR2A array.
32   TYPE
33     EXPRESSION_REC                 IS RECORD (
34       lstmt                          dbms_sql.VARCHAR2A,
35       lb                             BINARY_INTEGER DEFAULT 1,
36       ub                             BINARY_INTEGER DEFAULT 0);
37 
38   -----------------------------------------------------------------------------
39   -- TRAMSFORMation RECord type
40   -----------------------------------------------------------------------------
41   -- This type is used for storing in-memory transformation stack definition
42   -- for a single attribute.
43   -- Attribute specification field is used to specify additional information
44   -- and actions for an attribute:
45   --   NOPREP - disables data prep for an attribute.
46   --            This is applicable to both stack_* interfaces in this package
47   --            as well as dbms_data_mining.create_model. Stack_* methods do
48   --            not do any stacking for attributes with NOPREP specified, while
49   --            create_model does not do any auto data prep even when the
50   --            global PREP_AUTO setting is set to ON.
51   TYPE
52     TRANSFORM_REC                  IS RECORD (
53       attribute_name                 VARCHAR2(30),
54       attribute_subname              VARCHAR2(4000),
55       expression                     EXPRESSION_REC,
56       reverse_expression             EXPRESSION_REC,
57       attribute_spec                 VARCHAR2(4000));
58 
59   -----------------------------------------------------------------------------
60   -- TRAMSFORMation LIST collection type
61   -----------------------------------------------------------------------------
62   -- This type is used for storing in-memory transformation stack definition.
63   TYPE
64     TRANSFORM_LIST                 IS TABLE OF TRANSFORM_REC;
65 
66   -----------------------------------------------------------------------------
67   -- NESTed COLumn TYPEs
68   -----------------------------------------------------------------------------
69   nest_num_col_type              CONSTANT NUMBER := 100001;
70   nest_cat_col_type              CONSTANT NUMBER := 100002;
71   nest_bd_col_type               CONSTANT NUMBER := 100003;
72   nest_bf_col_type               CONSTANT NUMBER := 100004;
73 
74   nest_nums_col_name       CONSTANT VARCHAR2(30) := 'DM_NESTED_NUMERICALS';
75   nest_cats_col_name       CONSTANT VARCHAR2(30) := 'DM_NESTED_CATEGORICALS';
76   nest_bds_col_name        CONSTANT VARCHAR2(30) := 'DM_NESTED_BINARY_DOUBLES';
77   nest_bfs_col_name        CONSTANT VARCHAR2(30) := 'DM_NESTED_BINARY_FLOATS';
78 
79   -----------------------------------------------------------------------------
80   --                            create_col_rem                               --
81   -----------------------------------------------------------------------------
82   -- NAME
83   --   create_col_rem - CREATE COLumn REMoval definition table
84   -- DESCRIPTION
85   --   Creates column removal definition table:
86   --       CREATE TABLE <col_rem>(
87   --         col   VARCHAR2(30),
88   --         att   VARCHAR2(4000))
89   --   This table is used to guide query generation process that removes
90   --   specified columns.
91   -- PARAMETERS
92   --   rem_table_name                 - column removal definition table
93   --   rem_schema_name                - definition table schema name
94   -- RETURNS
95   --   None
96   -- EXCEPTIONS
97   --   None
98   -- NOTES
99   --   Note, that {col} is case sensitive since it generates quoted
100   --   identifiers. It is allowed to have multiple entries in the xform
101   --   defintion table for the same {col}.
102   -----------------------------------------------------------------------------
103   PROCEDURE create_col_rem(
104     rem_table_name                 VARCHAR2,
105     rem_schema_name                VARCHAR2 DEFAULT NULL);
106 
107   -----------------------------------------------------------------------------
108   --                              xform_col_rem                              --
109   -----------------------------------------------------------------------------
110   -- NAME
111   --   xform_col_rem - XFORMation COLumn REMoval
112   -- DESCRIPTION
113   --   Creates a view that perfoms column removal from the data table. Only the
114   --   columns that are specified in the xform definition are removed, the
115   --   remaining columns do not change.
116   -- PARAMETERS
117   --   rem_table_name                 - xform definition table
118   --   data_table_name                - data table
119   --   xform_view_name                - xform view
120   --   rem_schema_name                - xform definition table schema name
121   --   data_schema_name               - data table schema name
122   --   xform_schema_name              - xform view schema name
123   -- RETURNS
124   --   None
125   -- EXCEPTIONS
126   --   None
127   -- NOTES
128   --   None
129   -----------------------------------------------------------------------------
130   PROCEDURE xform_col_rem(
131     rem_table_name                 VARCHAR2,
132     data_table_name                VARCHAR2,
133     xform_view_name                VARCHAR2,
134     rem_schema_name                VARCHAR2 DEFAULT NULL,
135     data_schema_name               VARCHAR2 DEFAULT NULL,
136     xform_schema_name              VARCHAR2 DEFAULT NULL);
137 
138   -----------------------------------------------------------------------------
139   --                              stack_col_rem                              --
140   -----------------------------------------------------------------------------
141   -- NAME
142   --   stack_col_rem - STACK COLumn REMoval
143   -- DESCRIPTION
144   --   For every column in the stack definition that has a matching entry in
145   --   the transformation definition, stacks column removal expression (NULL
146   --   expression) instead of the existing expression and updates the stack
147   --   definition. Columns in the stack definition that have no matching
148   --   entries in the transformation definition or have NULL expression are not
149   --   changed. Columns in the transformation definition that have no matching
150   --   entries in the stack definition are added to the stack definition (with
151   --   NULL expression). Reverse expressions in the stack definition are
152   --   updated accordingly, that is if an expression is updated, added or
153   --   remains unchanged then a corresponding reverse expression is also
154   --   updated, added or remains unchanged. When reverse expression is NULL it
155   --   is treated as "{attr}" (identity function) or "VALUE" for nested
156   --   columns. Given an entry ({attr}, {expr}, {rexp}) in the stack definition
157   --   and a matching entry in the transformation definition, after stacking
158   --   {expr} and {rexp} are both changed to NULL.
159   --   Example 1. Given transformation definition:
160   --       {col = COL1, att = NULL}
161   --       {col = COL2, att = NULL}
162   --   and stack definition:
163   --       {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
164   --       {attr = COL3, expr = ln(COL3),      rexp = exp(COL3)}
165   --   the following updated stack definition is generated:
166   --       {attr = COL1, expr = NULL,     rexp = NULL}
167   --       {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
168   --       {attr = COL2, expr = NULL,     rexp = NULL}
169   -- PARAMETERS
170   --   rem_table_name                 - xform definition table
171   --   xform_list                     - stack definition
172   --   rem_schema_name                - xform definition table schema name
173   -- RETURNS
174   --   xform_list                     - updated stack definition
175   -- EXCEPTIONS
176   --   None
177   -- NOTES
178   --   None
179   -----------------------------------------------------------------------------
180   PROCEDURE stack_col_rem(
181     rem_table_name                 VARCHAR2,
182     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
183     rem_schema_name                VARCHAR2 DEFAULT NULL);
184 
185   -----------------------------------------------------------------------------
186   --                            create_norm_lin                              --
187   -----------------------------------------------------------------------------
188   -- NAME
189   --   create_norm_lin - CREATE LINear NORMalization definition table
190   -- DESCRIPTION
191   --   Creates linear normalization definition table:
192   --       CREATE TABLE <norm_lin>(
193   --         col   VARCHAR2(30),
194   --         att   VARCHAR2(4000),
195   --         shift NUMBER,
196   --         scale NUMBER)
197   --   This table is used to guide query generation process to construct
198   --   linear normalization expressions of the following form:
199   --       ("{col}" - {shift})/{scale} "{col}"
200   --   For example when col = 'my_col', shift = -1.5 and scale = 20 the
201   --   following expression is generated:
202   --       ("my_col" - (-1.5))/20 "my_col"
203   -- PARAMETERS
204   --   norm_table_name                - linear normalization definition table
205   --   norm_schema_name               - definition table schema name
206   -- RETURNS
207   --   None
208   -- EXCEPTIONS
209   --   None
210   -- NOTES
211   --   Note, that {col} is case sensitive since it generates quoted
212   --   identifiers.  When there are multiple entries in the xform defintion
213   --   table for the same {col} the behavior is undefined.  Any one of the
214   --   definitions may be used in query generation. NULL values remain
215   --   unchanged.
216   -----------------------------------------------------------------------------
217   PROCEDURE create_norm_lin(
218     norm_table_name                VARCHAR2,
219     norm_schema_name               VARCHAR2 DEFAULT NULL);
220 
221   -----------------------------------------------------------------------------
222   --                          insert_norm_lin_minmax                         --
223   -----------------------------------------------------------------------------
224   -- NAME
225   --   insert_norm_lin_minmax - INSERT into LINear NORMalization MINMAX
226   -- DESCRIPTION
227   --   For every NUMBER column in the data table that is not in the exclusion
228   --   list finds normalization definition and inserts it into the definition
229   --   table. Definition for each relevant column is computed based on the min
230   --   and max values that are computed from the data table:
231   --       shift = min
232   --       scale = max - min
233   --   The values of shift and scale are rounded to round_num significant
234   --   digits prior to storing them in the definition table.
235   -- PARAMETERS
236   --   norm_table_name                - linear normalization definition table
237   --   data_table_name                - data table
238   --   exclude_list                   - column exclusion list
239   --   round_num                      - number of significant digits
240   --   norm_schema_name               - definition table schema name
241   --   data_schema_name               - data table schema name
242   -- RETURNS
243   --   None
244   -- EXCEPTIONS
245   --   None
246   -- NOTES
247   --   Columns with all NULLs or only one unique value are ignored.
248   -----------------------------------------------------------------------------
249   PROCEDURE insert_norm_lin_minmax(
250     norm_table_name                VARCHAR2,
251     data_table_name                VARCHAR2,
252     exclude_list                   COLUMN_LIST DEFAULT NULL,
253     round_num                      PLS_INTEGER DEFAULT 6,
254     norm_schema_name               VARCHAR2 DEFAULT NULL,
255     data_schema_name               VARCHAR2 DEFAULT NULL);
256 
257   -----------------------------------------------------------------------------
258   --                          insert_norm_lin_scale                          --
259   -----------------------------------------------------------------------------
260   -- NAME
261   --   insert_norm_lin_scale - INSERT into LINear NORMalization SCALE
262   -- DESCRIPTION
263   --   For every NUMBER column in the data table that is not in the exclusion
264   --   list finds normalization definition and inserts it into the definition
265   --   table. Definition for each relevant column is computed based on the min
266   --   and max values that are computed from the data table:
267   --       shift = 0
268   --       scale = greatest(abs(max), abs(min))
269   --   The value of scale is rounded to round_num significant digits prior to
270   --   storing it in the definition table.
271   -- PARAMETERS
272   --   norm_table_name                - linear normalization definition table
273   --   data_table_name                - data table
274   --   exclude_list                   - column exclusion list
275   --   round_num                      - number of significant digits
276   --   norm_schema_name               - definition table schema name
277   --   data_schema_name               - data table schema name
278   -- RETURNS
279   --   None
280   -- EXCEPTIONS
281   --   None
282   -- NOTES
283   --   Columns with all NULLs or all zeros are ignored.
284   -----------------------------------------------------------------------------
285   PROCEDURE insert_norm_lin_scale(
286     norm_table_name                VARCHAR2,
287     data_table_name                VARCHAR2,
288     exclude_list                   COLUMN_LIST DEFAULT NULL,
289     round_num                      PLS_INTEGER DEFAULT 6,
290     norm_schema_name               VARCHAR2 DEFAULT NULL,
291     data_schema_name               VARCHAR2 DEFAULT NULL);
292 
293   -----------------------------------------------------------------------------
294   --                          insert_norm_lin_zscore                         --
295   -----------------------------------------------------------------------------
296   -- NAME
297   --   insert_norm_lin_zscore - INSERT into LINear NORMalization Z-SCORE
298   -- DESCRIPTION
299   --   For every NUMBER column in the data table that is not in the exclusion
300   --   list finds normalization definition and inserts it into the definition
301   --   table. Definition for each relevant column is computed based on the
302   --   mean and standard deviation that are estimated from the data table:
303   --       shift = mean
304   --       scale = stddev
305   --   The values of shift and scale are rounded to round_num significant
306   --   digits prior to storing them in the definition table.
307   -- PARAMETERS
308   --   norm_table_name                - linear normalization definition table
309   --   data_table_name                - data table
310   --   exclude_list                   - column exclusion list
311   --   round_num                      - number of significant digits
312   --   norm_schema_name               - definition table schema name
313   --   data_schema_name               - data table schema name
314   -- RETURNS
315   --   None
316   -- EXCEPTIONS
317   --   None
318   -- NOTES
319   --   Columns with all NULLs or only one unique value are ignored.
320   -----------------------------------------------------------------------------
321   PROCEDURE insert_norm_lin_zscore(
322     norm_table_name                VARCHAR2,
323     data_table_name                VARCHAR2,
327     data_schema_name               VARCHAR2 DEFAULT NULL);
324     exclude_list                   COLUMN_LIST DEFAULT NULL,
325     round_num                      PLS_INTEGER DEFAULT 6,
326     norm_schema_name               VARCHAR2 DEFAULT NULL,
328 
329   -----------------------------------------------------------------------------
330   --                              xform_norm_lin                             --
331   -----------------------------------------------------------------------------
332   -- NAME
333   --   xform_norm_lin - XFORMation LINear NORMalization
334   -- DESCRIPTION
335   --   Creates a view that perfoms linear normalization of the data table
336   --   Only the columns that are specified in the xform definition are
337   --   normalized, the remaining columns do not change.
338   -- PARAMETERS
339   --   norm_table_name                - xform definition table
340   --   data_table_name                - data table
341   --   xform_view_name                - xform view
342   --   norm_schema_name               - xform definition table schema name
343   --   data_schema_name               - data table schema name
344   --   xform_schema_name              - xform view schema name
345   -- RETURNS
346   --   None
347   -- EXCEPTIONS
348   --   None
349   -- NOTES
350   --   None
351   -----------------------------------------------------------------------------
352   PROCEDURE xform_norm_lin(
353     norm_table_name                VARCHAR2,
354     data_table_name                VARCHAR2,
355     xform_view_name                VARCHAR2,
356     norm_schema_name               VARCHAR2 DEFAULT NULL,
357     data_schema_name               VARCHAR2 DEFAULT NULL,
358     xform_schema_name              VARCHAR2 DEFAULT NULL);
359 
360   -----------------------------------------------------------------------------
361   --                              stack_norm_lin                             --
362   -----------------------------------------------------------------------------
363   -- NAME
364   --   stack_norm_lin - STACK LINear NORMalization
365   -- DESCRIPTION
366   --   For every column in the stack definition that has a matching entry in
367   --   the transformation definition, stacks linear normalization expression on
368   --   top of the existing expression and updates the stack definition.
369   --   Columns in the stack definition that have no matching entries in the
370   --   transformation definition or have NULL expression are not changed.
371   --   Columns in the transformation definition that have no matching entries
372   --   in the stack definition are added to the stack definition (using "{col}"
373   --   in place of the original expression or "VALUE" for nested attributes).
374   --   Reverse expressions in the stack definition are updated accordingly,
375   --   that is if an expression is updated, added or remains unchanged then a
376   --   corresponding reverse expression is also updated, added or remains
377   --   unchanged. When reverse expression is NULL it is treated as identity
378   --   function ("{attr}" or "VALUE" for nested attributes).
379   --   Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
380   --   matching entry in the transformation definition, after stacking {expr}
381   --   has the following form:
382   --       ({expr} - {shift})/{scale}
383   --   and {rexp} maintains the following form with every occurance of {attr}
384   --   replaced with:
385   --       {attr}*{scale} + {shift}
386   --   Example 1. Given transformation definition:
387   --       {col = COL1, shift = -1.5, scale = 20}
388   --       {col = COL2, shift = 0,    scale = 10}
389   --   and stack definition:
390   --       {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
391   --       {attr = COL3, expr = ln(COL3),      rexp = exp(COL3)}
392   --   the following updated stack definition is generated:
393   --       {attr = COL1,
394   --        expr = (log(10, COL1) - (-1.5)) / 20,
395   --        rexp = power(10, COL1*20 + (-1.5))}
396   --       {attr = COL3,
397   --        expr = ln(COL3),
398   --        rexp = exp(COL3)}
399   --       {attr = COL2,
400   --        expr = (COL2 - 0) / 10,
401   --        rexp = COL2*10 + 0}
402   -- PARAMETERS
403   --   norm_table_name                - xform definition table
404   --   xform_list                     - stack definition
405   --   norm_schema_name               - xform definition table schema name
406   -- RETURNS
407   --   xform_list                     - updated stack definition
408   -- EXCEPTIONS
409   --   None
410   -- NOTES
411   --   None
412   -----------------------------------------------------------------------------
413   PROCEDURE stack_norm_lin(
414     norm_table_name                VARCHAR2,
415     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
416     norm_schema_name               VARCHAR2 DEFAULT NULL);
417 
418   -----------------------------------------------------------------------------
419   --                            create_bin_num                               --
420   -----------------------------------------------------------------------------
421   -- NAME
422   --   create_bin_num - CREATE NUMerical BINning definition table
423   -- DESCRIPTION
424   --   Creates numerical binning definition table:
425   --       CREATE TABLE <bin_num>(
426   --         col   VARCHAR2(30),
427   --         att   VARCHAR2(4000),
428   --         val   NUMBER,
429   --         bin   VARCHAR2(4000))
430   --   This table is used to guide query generation process to construct
431   --   numerical binning expressions of the following form:
432   --       CASE WHEN "{col}" <  {val0}   THEN '{bin0}'
436   --            WHEN "{col}" IS NOT NULL THEN '{bin(N+1)}'
433   --            WHEN "{col}" <= {val1}   THEN '{bin1}'
434   --            ...
435   --            WHEN "{col}" <= {valN}   THEN '{binN}'
437   --       END "{col}"
438   --   This expression maps values in the range [{val0};{valN}] into N bins
439   --   {bin1}, ..., {binN}, values outside of this range into {bin0} or
440   --   {bin(N+1)}, such that
441   --       (-inf; {val0})       -> {bin0}
442   --       [{val0}; {val1}]     -> {bin1}
443   --       ...
444   --       ({val(N-1)}; {valN}] -> {binN}
445   --       ({valN}; +inf)       -> {bin(N+1)}.
446   --   NULL values remain unchanged. {bin(N+1)} is optional. If it is not
447   --   specified the values ("{col}" > {valN}) are mapped to NULL. To specify
448   --   {bin(N+1)} provide a row with {val} = NULL. The order of the WHEN ..
449   --   THEN pairs is based on the ascending order of {val} for a given {col}.
450   --   Example 1. <bin_num> contains four rows with {col} = 'mycol':
451   --       {col = 'mycol', val = 15.5, bin = 'small'}
452   --       {col = 'mycol', val = 10,   bin = 'tiny'}
453   --       {col = 'mycol', val = 20,   bin = 'large'}
454   --       {col = 'mycol', val = NULL, bin = 'huge'}
455   --   the following expression is generated:
456   --       CASE WHEN "mycol" <  10       THEN 'tiny'
457   --            WHEN "mycol" <= 15.5     THEN 'small'
458   --            WHEN "mycol" <= 20       THEN 'large'
459   --            WHEN "mycol" IS NOT NULL THEN 'huge'
460   --       END "mycol"
461   --   Example 2. <bin_num> contains three rows with {col} = 'mycol':
462   --       {col = 'mycol', val = 15.5, bin = NULL}
463   --       {col = 'mycol', val = 10,   bin = 'tiny'}
464   --       {col = 'mycol', val = 20,   bin = 'large'}
465   --   the following expression is generated:
466   --       CASE WHEN "mycol" <  10   THEN NULL
467   --            WHEN "mycol" <= 15.5 THEN 'small'
468   --            WHEN "mycol" <= 20   THEN 'large'
469   --       END "mycol"
470   -- PARAMETERS
471   --   bin_table_name                 - numerical binning definition table
472   --   bin_schema_name                - definition table schema name
473   -- RETURNS
474   --   None
475   -- EXCEPTIONS
476   --   None
477   -- NOTES
478   --   Note, that {col} is case sensitive since it generates quoted
479   --   identifiers. In cases when there are multiple entries with the same
480   --   {col}, {val} combiniation with different {bin} the behavior is
481   --   undefined. Any one of the {bin} might be used. The maximum number of
482   --   arguments in a CASE expression is 255, and each WHEN ... THEN pair
483   --   counts as two arguments.
484   -----------------------------------------------------------------------------
485   PROCEDURE create_bin_num(
486     bin_table_name                 VARCHAR2,
487     bin_schema_name                VARCHAR2 DEFAULT NULL);
488 
489   -----------------------------------------------------------------------------
490   --                          insert_bin_num_eqwidth                         --
491   -----------------------------------------------------------------------------
492   -- NAME
493   --   insert_bin_num_eqwidth - INSERT into NUMerical BINning EQual-WIDTH
494   -- DESCRIPTION
495   --   For every NUMBER column in the data table that is not in the exclusion
496   --   list finds numerical binning definition and inserts it into the
497   --   definition table. Definition for each relevant column is computed based
498   --   on the min and max values that are computed from the data table. Each
499   --   of the N (p_bin_num) bins {bin1}, ..., {binN} span ranges of equal
500   --   width
501   --       inc = (max - min) / N
502   --   where {binI} = I when N > 0 or {binI} = N+1-I when N < 0, and
503   --   {bin0} = {bin(N+1)} = NULL. For example, when N=2, col='mycol', min=10,
504   --   and max = 21, the following three rows are inserted into the
505   --   definition table (inc = 5.5):
506   --       COL     VAL BIN
507   --       ----- ----- -----
508   --       mycol    10 NULL
509   --       mycol  15.5 1
510   --       mycol    21 2
511   --   The values of {val} are rounded to round_num significant digits prior
512   --   to storing them in the definition table.
513   -- PARAMETERS
514   --   bin_table_name                 - numerical binning definition table
515   --   data_table_name                - data table
516   --   bin_num                        - number of bins
517   --   exclude_list                   - column exclusion list
518   --   round_num                      - number of significant digits
519   --   bin_schema_name                - definition table schema name
520   --   data_schema_name               - data table schema name
521   -- RETURNS
522   --   None
523   -- EXCEPTIONS
524   --   None
525   -- NOTES
526   --   Columns with all NULLs or only one unique value are ignored. Nothing
527   --   is done when bin_num IS NULL or bin_num = 0.
528   -----------------------------------------------------------------------------
529   PROCEDURE insert_bin_num_eqwidth(
530     bin_table_name                 VARCHAR2,
531     data_table_name                VARCHAR2,
532     bin_num                        PLS_INTEGER DEFAULT 10,
533     exclude_list                   COLUMN_LIST DEFAULT NULL,
534     round_num                      PLS_INTEGER DEFAULT 6,
535     bin_schema_name                VARCHAR2 DEFAULT NULL,
536     data_schema_name               VARCHAR2 DEFAULT NULL);
537 
538   -----------------------------------------------------------------------------
542   --   insert_bin_num_qtile - INSERT into NUMerical BINning QuanTILE
539   --                          insert_bin_num_qtile                           --
540   -----------------------------------------------------------------------------
541   -- NAME
543   -- DESCRIPTION
544   --   For every NUMBER column in the data table that is not in the exclusion
545   --   list finds numerical binning definition and inserts it into the
546   --   definition table. Definition for each relevant column is computed based
547   --   on the min values per quantile, where quantiles are computed from the
548   --   data using NTILE function. Bin {bin1} spans range [min(1), min(2)],
549   --   bins {bin2}, ..., {bin(N-1)} span ranges (min(I), min(I+1)] and {binN}
550   --   range (min(N), max(N)] with {binI} = I when N > 0 or {binI} = N+1-I
551   --   when N < 0, and {bin0}={bin(N+1)} = NULL. Bins with equal left and
552   --   right boundaries are collapsed. For example, when N=4, col='mycol',
553   --   and data is {1,2,2,2,2,3,4}, the following three rows are inserted into
554   --   the definition table:
555   --       COL     VAL BIN
556   --       ----- ----- -----
557   --       mycol     1 NULL
558   --       mycol     2 1
559   --       mycol     4 2
560   --   Here quantiles are {1,2}, {2,2}, {2,3}, {4} and min(1) = 1, min(2) = 2,
561   --   min(3) = 2, min(4) = 4, max(4) = 4, and ranges are [1,2], (2,2], (2,4],
562   --   (4,4]. After collapsing [1,2], (2,4].
563   -- PARAMETERS
564   --   bin_table_name                 - numerical binning definition table
565   --   data_table_name                - data table
566   --   bin_num                        - number of bins
567   --   exclude_list                   - column exclusion list
568   --   bin_schema_name                - definition table schema name
569   --   data_schema_name               - data table schema name
570   -- RETURNS
571   --   None
572   -- EXCEPTIONS
573   --   None
574   -- NOTES
575   --   Columns with all NULLs or only one unique value are ignored. Nothing
576   --   is done when bin_num IS NULL or bin_num = 0.
577   -----------------------------------------------------------------------------
578   PROCEDURE insert_bin_num_qtile(
579     bin_table_name                 VARCHAR2,
580     data_table_name                VARCHAR2,
581     bin_num                        PLS_INTEGER DEFAULT 10,
582     exclude_list                   COLUMN_LIST DEFAULT NULL,
583     bin_schema_name                VARCHAR2 DEFAULT NULL,
584     data_schema_name               VARCHAR2 DEFAULT NULL);
585 
586   -----------------------------------------------------------------------------
587   --                      insert_autobin_num_eqwidth                         --
588   -----------------------------------------------------------------------------
589   -- NAME
590   --   insert_autobin_num_eqwidth - INSERT into NUMerical BINning AUTOmated
591   --                                EQual-WIDTH
592   -- DESCRIPTION
593   --   For every NUMBER column in the data table that is not in the exclusion
594   --   list finds numerical binning definition and inserts it into the
595   --   definition table. Definition for each relevant column is computed using
596   --   equal-width method (see description for insert_bin_nume_eqwidth). The
597   --   number of bins (N) is computed for each column separately and is based
598   --   on the number of non-NULL values (cnt), min and max values, and the
599   --   standard deviation (dev)
600   --       N = floor(power(cnt, 1/3)*(max - min)/(C*dev))
601   --   where C = 3.49/0.9. Parameter bin_num is used to adjust N to be at
602   --   least bin_num. No adjustment is done when bin_num is NULL or zero.
603   --   Parameter max_bin_num is used to adjust N to be at most max_bin_num.
604   --   No adjustment is done when max_bin_num is NULL or zero. For columns
605   --   with all integer values (discrete columns) N is adjusted to be at most
606   --   the maximum number of distinct values in the obseved range
607   --       max - min + 1
608   --   Parameter sample_size is used to adjust cnt to be at most sample_size.
609   --   No adjustment is done when sample_size is NULL or zero.
610   -- PARAMETERS
611   --   bin_table_name                 - numerical binning definition table
612   --   data_table_name                - data table
613   --   bin_num                        - minimum number of bins
614   --   max_bin_num                    - maximum number of bins
615   --   exclude_list                   - column exclusion list
616   --   round_num                      - number of significant digits
617   --   sample_size                    - maximum size of the sample
618   --   bin_schema_name                - definition table schema name
619   --   data_schema_name               - data table schema name
620   --   rem_table_name                 - column removal definition table
621   --   rem_schema_name                - removal definition table schema
622   -- RETURNS
623   --   None
624   -- EXCEPTIONS
625   --   None
626   -- NOTES
627   --   Columns with all NULLs or only one unique value are ignored. The sign
628   --   of bin_num, max_bin_num, sample_size has no effect on the result, only
629   --   the absolute values are being used. The value adjustment for N is done
630   --   in the following order: first, bin_num, then max_bin_num, and then
631   --   discrete column adjustment.
632   --   Column removal definition table is optional. If specified columns with
633   --   all NULLs or only one unique value will be inserted into the column
634   --   removal definition table.
635   -----------------------------------------------------------------------------
639     bin_num                        PLS_INTEGER DEFAULT 3,
636   PROCEDURE insert_autobin_num_eqwidth(
637     bin_table_name                 VARCHAR2,
638     data_table_name                VARCHAR2,
640     max_bin_num                    PLS_INTEGER DEFAULT 100,
641     exclude_list                   COLUMN_LIST DEFAULT NULL,
642     round_num                      PLS_INTEGER DEFAULT 6,
643     sample_size                    PLS_INTEGER DEFAULT 50000,
644     bin_schema_name                VARCHAR2 DEFAULT NULL,
645     data_schema_name               VARCHAR2 DEFAULT NULL,
646     rem_table_name                 VARCHAR2 DEFAULT NULL,
647     rem_schema_name                VARCHAR2 DEFAULT NULL);
648 
649   -----------------------------------------------------------------------------
650   --                              xform_bin_num                              --
651   -----------------------------------------------------------------------------
652   -- NAME
653   --   xform_bin_num - XFORMation NUMerical BINning
654   -- DESCRIPTION
655   --   Creates a view that perfoms numerical binning of the data table. Only
656   --   the columns that are specified in the xform definition are binned, the
657   --   remaining columns do not change.
658   -- PARAMETERS
659   --   bin_table_name                 - xform definition table
660   --   data_table_name                - data table
661   --   xform_view_name                - xform view
662   --   literal_flag                   - literal flag
663   --   bin_schema_name                - xform definition table schema name
664   --   data_schema_name               - data table schema name
665   --   xform_schema_name              - xform view schema name
666   -- RETURNS
667   --   None
668   -- EXCEPTIONS
669   --   None
670   -- NOTES
671   --   Literal flag indicates whether the values in {bin} are valid SQL
672   --   literals. When the the flag is set to TRUE the value of {bin} is used
673   --   as is in query generation, otherwise it is converted into a valid text
674   --   literal (surround by quotes and double the quotes inside). By default
675   --   the flag is set to FALSE. One example when it can be set to TRUE is in
676   --   cases when all {bin} are numbers. In that case the xformed column will
677   --   remain numeric as opposed to textual (default behavior). For example,
678   --   for the following xfrom definition:
679   --       COL     VAL BIN
680   --       ----- ----- -----
681   --       mycol    10 NULL
682   --       mycol  15.5 1
683   --       mycol    21 2
684   --   the following expression is generated when the flag is set to FALSE:
685   --       CASE WHEN "mycol" <  10   THEN NULL
686   --            WHEN "mycol" <= 15.5 THEN '1'
687   --            WHEN "mycol" <= 20   THEN '2'
688   --       END "mycol"
689   --   and when the flag is set to TRUE:
690   --       CASE WHEN "mycol" <  10   THEN NULL
691   --            WHEN "mycol" <= 15.5 THEN 1
692   --            WHEN "mycol" <= 20   THEN 2
693   --       END "mycol"
694   -----------------------------------------------------------------------------
695   PROCEDURE xform_bin_num(
696     bin_table_name                 VARCHAR2,
697     data_table_name                VARCHAR2,
698     xform_view_name                VARCHAR2,
699     literal_flag                   BOOLEAN DEFAULT FALSE,
700     bin_schema_name                VARCHAR2 DEFAULT NULL,
701     data_schema_name               VARCHAR2 DEFAULT NULL,
702     xform_schema_name              VARCHAR2 DEFAULT NULL);
703 
704   -----------------------------------------------------------------------------
705   --                              stack_bin_num                              --
706   -----------------------------------------------------------------------------
707   -- NAME
708   --   stack_bin_num - STACK NUMerical BINning
709   -- DESCRIPTION
710   --   For every column in the stack definition that has a matching entry in
711   --   the transformation definition, stacks numerical binning expression on
712   --   top of the existing expression and updates the stack definition.
713   --   Columns in the stack definition that have no matching entries in the
714   --   transformation definition or have NULL expression are not changed.
715   --   Columns in the transformation definition that have no matching entries
716   --   in the stack definition are added to the stack definition (using "{col}"
717   --   in place of the original expression or "VALUE" for nested attributes).
718   --   Reverse expressions in the stack definition are updated accordingly,
719   --   that is if an expression is updated, added or remains unchanged then a
720   --   corresponding reverse expression is also updated, added or remains
721   --   unchanged. When reverse expression is NULL it is treated as identity
722   --   function ("{attr}" or "VALUE" for nested attributes).
723   --   Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
724   --   matching entry in the transformation definition, after stacking {expr}
725   --   has the following form:
726   --       CASE WHEN ({expr}) <  {val0}   THEN '{bin0}'
727   --            WHEN ({expr}) <= {val1}   THEN '{bin1}'
728   --            ...
729   --            WHEN ({expr}) <= {valN}   THEN '{binN}'
730   --            WHEN ({expr}) IS NOT NULL THEN '{bin(N+1)}'
731   --       END
732   --   and {rexp} has the following form:
733   --       DECODE("{attr}", '{bin0}',     '( ; {rev0})',
734   --                        '{bin1}',     '[{rev0}; {rev1})',
735   --                        ...
736   --                        '{binN}',     '[{rev(N-1)}; {revN}]',
740   --   If {binI} and {binJ} are equal then the corresponding entries of the
737   --                        '{bin(N+1)}', '({revN}; )',
738   --                        NULL,         'NULL')
739   --   where {revI} is the result of applying reverese expression to {valI}.
741   --   DECODE function above are merged into:
742   --       '{binI}', '[{rev(I-1)}; {revI}), [{rev(J-1)}; {revJ})'
743   --   Note that reverse expressions implicitly maps invalid bins to NULL.
744   --   Example 1. Given transformation definition:
745   --       {col = COL1, val = 0,   bin = NULL}
746   --       {col = COL1, val = 1,   bin = A}
747   --       {col = COL1, val = 2,   bin = B}
748   --       {col = COL1, val = 3,   bin = A}
749   --       {col = COL2, val = 10,  bin = NULL}
750   --       {col = COL2, val = 15,  bin = 1}
751   --       {col = COL2, val = 20,  bin = 2}
752   --   and stack definition:
753   --       {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
754   --       {attr = COL3, expr = ln(COL3),      rexp = exp(COL3)}
755   --   the following updated stack definition is generated:
756   --       {attr = COL1,
757   --        expr = CASE WHEN (log(10, COL1)) <  0 THEN NULL
758   --                    WHEN (log(10, COL1)) <= 1 THEN 'A'
759   --                    WHEN (log(10, COL1)) <= 2 THEN 'B'
760   --                    WHEN (log(10, COL1)) <= 3 THEN 'A' END,
761   --        rexp = DECODE("COL1", 'A',   '[1; 10), [100; 1000]',
762   --                              'B',   '[10; 100)',
763   --                               NULL, '( ; 1), (1000; ), NULL')}
764   --       {attr = COL3,
765   --        expr = ln(COL3),
766   --        rexp = exp(COL3)}
767   --       {attr = COL2,
768   --        expr = CASE WHEN "COL2" <  10 THEN NULL
769   --                    WHEN "COL2" <= 15 THEN '1'
770   --                    WHEN "COL2" <= 20 THEN '2' END
771   --        rexp = DECODE("COL2", '1',  '[10; 15)',
772   --                              '2',  '[15; 20]',
773   --                              NULL, '( ; 10) OR (20; ) OR NULL')}
774   -- PARAMETERS
775   --   bin_table_name                 - xform definition table
776   --   xform_list                     - stack definition
777   --   literal_flag                   - literal flag
778   --   bin_schema_name                - xform definition table schema name
779   -- RETURNS
780   --   xform_list                     - updated stack definition
781   -- EXCEPTIONS
782   --   None
783   -- NOTES
784   --   Literal flag indicates whether the values in {bin} are valid SQL
785   --   literals. When the the flag is set to TRUE the value of {bin} is used
786   --   "as is" in the expression generation, otherwise it is converted into a
787   --   valid text literal (surround by quotes and double the quotes inside). By
788   --   default the flag is set to FALSE. One example when it can be set to TRUE
789   --   is in cases when all {bin} are numbers. In that case the xformed column
790   --   will remain numeric as opposed to textual (default behavior). For
791   --   example, for the following xfrom definition:
792   --       COL     VAL BIN
793   --       ----- ----- -----
794   --       mycol    10 NULL
795   --       mycol    15 1
796   --       mycol    20 2
797   --   the following {expr} and {rexp} are generated when the flag is FALSE:
798   --        expr = CASE WHEN "mycol" <  10 THEN NULL
799   --                    WHEN "mycol" <= 15 THEN '1'
800   --                    WHEN "mycol" <= 20 THEN '2' END
801   --        rexp = DECODE("mycol", '1',  '[10; 15)',
802   --                               '2',  '[15; 20]'
803   --                               NULL, '( ; 10) OR (20; ) OR NULL')
804   --   and when the flag is set to TRUE:
805   --        expr = CASE WHEN "mycol" <  10 THEN NULL
806   --                    WHEN "mycol" <= 15 THEN 1
807   --                    WHEN "mycol" <= 20 THEN 2 END
808   --        rexp = DECODE("mycol", 1,    '[10; 15)',
809   --                               2,    '[15; 20]',
810   --                               NULL, '( ; 10) OR (20; ) OR NULL')
811   -----------------------------------------------------------------------------
812   PROCEDURE stack_bin_num(
813     bin_table_name                 VARCHAR2,
814     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
815     literal_flag                   BOOLEAN DEFAULT FALSE,
816     bin_schema_name                VARCHAR2 DEFAULT NULL);
817 
818   -----------------------------------------------------------------------------
819   --                            create_bin_cat                               --
820   -----------------------------------------------------------------------------
821   -- NAME
822   --   create_bin_cat - CREATE CATegorical BINning definition table
823   -- DESCRIPTION
824   --   Creates categorical binning definition table:
825   --       CREATE TABLE <bin_cat>(
826   --         col   VARCHAR2(30),
827   --         att   VARCHAR2(4000),
828   --         val   VARCHAR2(4000),
829   --         bin   VARCHAR2(4000))
830   --   This table is used to guide query generation process to construct
831   --   categorical binning expressions of the following form:
832   --       DECODE("{col}", {val1}, {bin1},
833   --                       ...
834   --                       {valN}, {binN},
835   --                       NULL,   NULL,
836   --                               {bin(N+1)}) "{col}"
837   --   This expression maps values {val1}, ..., {valN} into N bins {bin1},...,
838   --   {binN}, and other values into {bin(N+1)}, while NULL values remain
842   --       {col = 'mycol', val = 'Waltham',        bin = 'MA'}
839   --   unchanged. {bin(N+1)} is optional. If it is not specified it defaults
840   --   to NULL. To specify {bin(N+1)} provide a row with {val} = NULL.
841   --   Example 1. <bin_cat> contains four rows with {col} = 'mycol':
843   --       {col = 'mycol', val = 'Burlington',     bin = 'MA'}
844   --       {col = 'mycol', val = 'Redwood Shores', bin = 'CA'}
845   --       {col = 'mycol', val = NULL,             bin = 'OTHER'}
846   --   the following expression is generated:
847   --       DECODE("mycol", 'Waltham',        'MA',
848   --                       'Burlington',     'MA',
849   --                       'Redwood Shores', 'CA',
850   --                       NULL,             NULL,
851   --                                         'OTHER') "mycol"
852   --   Example 2. <bin_cat> contains three rows with {col} = 'mycol':
853   --       {col = 'mycol', val = 'Waltham',        bin = 'MA'}
854   --       {col = 'mycol', val = 'Burlington',     bin = 'MA'}
855   --       {col = 'mycol', val = 'Redwood Shores', bin = 'CA'}
856   --   the following expression is generated:
857   --       DECODE("mycol", 'Waltham',        'MA',
858   --                       'Burlington',     'MA',
859   --                       'Redwood Shores', 'CA') "mycol"
860   -- PARAMETERS
861   --   bin_table_name                 - categorical binning definition table
862   --   bin_schema_name                - definition table schema name
863   -- RETURNS
864   --   None
865   -- EXCEPTIONS
866   --   None
867   -- NOTES
868   --   Note, that {col} is case sensitive since it generates quoted
869   --   identifiers. In cases when there are multiple entries with the same
870   --   {col}, {val} combiniation with different {bin} the behavior is
871   --   undefined. Any one of the {bin} might be used. The maximum number of
872   --   arguments of a DECODE function is 255.
873   -----------------------------------------------------------------------------
874   PROCEDURE create_bin_cat(
875     bin_table_name                 VARCHAR2,
876     bin_schema_name                VARCHAR2 DEFAULT NULL);
877 
878   -----------------------------------------------------------------------------
879   --                          insert_bin_cat_freq                            --
880   -----------------------------------------------------------------------------
881   -- NAME
882   --   insert_bin_cat_freq - INSERT into CATegorical BINning top-FREQuency
883   -- DESCRIPTION
884   --   For every VARCHAR2, CHAR column in the data table that is not in the
885   --   exclusion list finds categorical binning definition and inserts it into
886   --   the definition table. Definition for each relevant column is computed
887   --   based on occurence frequency of column values that are computed from
888   --   the data table reference. Each of the N (bin_num) bins {bin1}, ...,
889   --   {binN} correspond to the values with top frequencies when N > 0 or
890   --   bottom frequencies when N < 0, and {bin(N+1)} to all remaining
891   --   values, where {binI} = I. Ordering ties among identical frequencies are
892   --   broken by ordering on column values (ASC for N > 0 or DESC for N < 0).
893   --   When the the number of distinct values C < N only C+1 bins will be
894   --   created. Parameter default_num (D) is used for prunning based on the
895   --   number of values that fall in the default bin. When D > 0 only columns
896   --   that have at least D defaults are kept while others are ignored. When
897   --   D < 0 only columns that have at most D values are kept. No prunning is
898   --   done when D is NULL or when D = 0. Parameter bin_support (SUP) is used
899   --   for restricting bins to frequent (SUP > 0) values frq >= SUP*tot, or
900   --   infrequent (SUP < 0) ones frq <= (-SUP)*tot, where frq is a given value
901   --   count and tot is a sum of all counts as computed from the data. No
902   --   support filtering is done when SUP is NULL or when SUP = 0.
903   -- PARAMETERS
904   --   bin_table_name                 - categorical binning definition table
905   --   data_table_name                - data table
906   --   bin_num                        - number of bins
907   --   exclude_list                   - column exclusion list
908   --   default_num                    - number of default values
909   --   bin_support                    - bin support (fraction)
910   --   bin_schema_name                - definition table schema name
911   --   data_schema_name               - data table schema name
912   -- RETURNS
913   --   None
914   -- EXCEPTIONS
915   --   None
916   -- NOTES
917   --   Nothing is done when bin_num IS NULL or bin_num = 0. NULL values
918   --   are not counted. Columns with all NULLs are ignored.
919   -----------------------------------------------------------------------------
920   PROCEDURE insert_bin_cat_freq(
921     bin_table_name                 VARCHAR2,
922     data_table_name                VARCHAR2,
923     bin_num                        PLS_INTEGER DEFAULT 9,
924     exclude_list                   COLUMN_LIST DEFAULT NULL,
925     default_num                    PLS_INTEGER DEFAULT 2,
926     bin_support                    NUMBER DEFAULT NULL,
927     bin_schema_name                VARCHAR2 DEFAULT NULL,
928     data_schema_name               VARCHAR2 DEFAULT NULL);
929 
930   -----------------------------------------------------------------------------
931   --                              xform_bin_cat                              --
932   -----------------------------------------------------------------------------
933   -- NAME
934   --   xform_bin_cat - XFORMation CATegorical BINning
935   -- DESCRIPTION
939   -- PARAMETERS
936   --   Creates a view that perfoms categorical binning of the data table. Only
937   --   the columns that are specified in the xform definition are binned, the
938   --   remaining columns do not change.
940   --   bin_table_name                 - xform definition table
941   --   data_table_name                - data table
942   --   xform_view_name                - xform view
943   --   litiral_flag                   - literal flag
944   --   bin_schema_name                - xform definition table schema name
945   --   data_schema_name               - data table schema name
946   --   xform_schema_name              - xform view schema name
947   -- RETURNS
948   --   None
949   -- EXCEPTIONS
950   --   None
951   -- NOTES
952   --   Literal flag indicates whether the values in {bin} are valid SQL
953   --   literals. When the the flag is set to TRUE the value of {bin} is used
954   --   as is in query generation, otherwise it is converted into a valid text
955   --   literal (surround by quotes and double the quotes inside). By default
956   --   the flag is set to FALSE. One example when it can be set to TRUE is in
957   --   cases when all {bin} are numbers. In that case the xformed column will
958   --   be numeric as opposed to textual (default behavior). For example,
959   --   for the following xfrom definition:
960   --       COL   VAL            BIN
961   --       ----- -------------- ----
962   --       mycol Waltham        1
963   --       mycol Burlington     1
964   --       mycol Redwood Shores 2
965   --   the following expression is generated when the flag is set to FALSE:
966   --       DECODE("mycol", 'Waltham',       '1',
967   --                       'Burlington',    '1',
968   --                       'Redwood Shores','2') "mycol"
969   --   and when the flag is set to TRUE:
970   --       DECODE("mycol", 'Waltham',        1,
971   --                       'Burlington',     1,
972   --                       'Redwood Shores', 2) "mycol"
973   -----------------------------------------------------------------------------
974   PROCEDURE xform_bin_cat(
975     bin_table_name                 VARCHAR2,
976     data_table_name                VARCHAR2,
977     xform_view_name                VARCHAR2,
978     literal_flag                   BOOLEAN DEFAULT FALSE,
979     bin_schema_name                VARCHAR2 DEFAULT NULL,
980     data_schema_name               VARCHAR2 DEFAULT NULL,
981     xform_schema_name              VARCHAR2 DEFAULT NULL);
982 
983   -----------------------------------------------------------------------------
984   --                              stack_bin_cat                              --
985   -----------------------------------------------------------------------------
986   -- NAME
987   --   stack_bin_cat - STACK CATegorical BINning
988   -- DESCRIPTION
989   --   For every column in the stack definition that has a matching entry in
990   --   the transformation definition, stacks categorical binning expression on
991   --   top of the existing expression and updates the stack definition.
992   --   Columns in the transformation definition that have no matching entries
993   --   in the stack definition are added to the stack definition (using "{col}"
994   --   in place of the original expression or "VALUE" for nested attributes).
995   --   Reverse expressions in the stack definition are updated accordingly,
996   --   that is if an expression is updated, added or remains unchanged then a
997   --   corresponding reverse expression is also updated, added or remains
998   --   unchanged. When reverse expression is NULL it is treated as identity
999   --   function ("{attr}" or "VALUE" for nested attributes).
1000   --   Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
1001   --   matching entry in the transformation definition, after stacking {expr}
1002   --   has the following form:
1003   --       DECODE({expr}, '{val1}', '{bin1}',
1004   --                      ...
1005   --                      '{valN}', '{binN}',
1006   --                      NULL,     NULL,
1007   --                                '{bin(N+1)}')
1008   --   and {rexp} has the following form:
1009   --       DECODE("{attr}", '{bin1}',     '{rev1}',
1010   --                        ...
1011   --                        '{binN}',     '{revN}',
1012   --                        NULL,         'NULL',
1013   --                        '{bin(N+1)}', 'DEFAULT')
1014   --   where {revI} is the result of applying reverese expression to {valI}.
1015   --   If {binI} and {binJ} are equal then the corresponding entries of the
1016   --   DECODE function above are merged into:
1017   --       '{binI}', '{revI}, {revJ}'
1018   --   If more than one entry maps to the default bin {bin(N+1)} they are all
1019   --   merged to
1020   --       '{bin(N+1)}', 'DEFAULT'
1021   --   Note that reverse expression implicitly maps invalid bins to NULL.
1022   --   Example 1. Given transformation definition:
1023   --       {col = COL1, val = waltham,        bin = MA}
1024   --       {col = COL1, val = burlington,     bin = MA}
1025   --       {col = COL1, val = redwood shores, bin = CA}
1026   --       {col = COL2, val = MA,             bin = East}
1027   --       {col = COL2, val = CA,             bin = West}
1028   --       {col = COL2, val = NULL,           bin = USA}
1029   --   and stack definition:
1030   --       {attr = COL1, expr = lower(COL1), rexp = initcap(COL1)}
1034   --        expr = DECODE(lower(COL1), 'waltham',        'MA',
1031   --       {attr = COL3, expr = upper(COL3), rexp = initcap(COL3)}
1032   --   the following updated stack definition is generated:
1033   --       {attr = COL1,
1035   --                                   'burlington',     'MA',
1036   --                                   'redwood shores', 'CA'),
1037   --        rexp = DECODE("COL1", 'MA', '''Waltham'', ''Burlington''',
1038   --                              'CA', '''Redwood Shores''',
1039   --                              NULL, 'DEFAULT')}
1040   --       {attr = COL3,
1041   --        expr = upper(COL3),
1042   --        rexp = initcap(COL3)}
1043   --       {attr = COL2,
1044   --        expr = DECODE("COL2", 'MA', 'East',
1045   --                              'NY', 'East',
1046   --                              'CA', 'West',
1047   --                              NULL, NULL,
1048   --                                    'USA')
1049   --        rexp = DECODE("COL2", 'East', '''MA''',
1050   --                              'West', '''CA''',
1051   --                              NULL,   'NULL',
1052   --                              'USA',  'DEFAULT')}
1053   -- PARAMETERS
1054   --   bin_table_name                 - xform definition table
1055   --   xform_list                     - stack definition
1056   --   literal_flag                   - literal flag
1057   --   bin_schema_name                - xform definition table schema name
1058   -- RETURNS
1059   --   xform_list                     - updated stack definition
1060   -- EXCEPTIONS
1061   --   None
1062   -- NOTES
1063   --   Literal flag indicates whether the values in {bin} are valid SQL
1064   --   literals. When the the flag is set to TRUE the value of {bin} is used
1065   --   "as is" in the expression generation, otherwise it is converted into a
1066   --   valid text literal (surround by quotes and double the quotes inside). By
1067   --   default the flag is set to FALSE. One example when it can be set to TRUE
1068   --   is in cases when all {bin} are numbers. In that case the xformed column
1069   --   will remain numeric as opposed to textual (default behavior). For
1070   --   example, for the following xfrom definition:
1071   --       COL   VAL            BIN
1072   --       ----- -------------- ----
1073   --       mycol Waltham        1
1074   --       mycol Burlington     1
1075   --       mycol Redwood Shores 2
1076   --   the following {expr} and {rexp} are generated when the flag is FALSE:
1077   --        expr = DECODE("mycol", 'Waltham',        '1',
1078   --                               'Burlington',     '1',
1079   --                               'Redwood Shores', '2')
1080   --        rexp = DECODE("COL1", '1',  '''Waltham'', ''Burlington''',
1081   --                              '2',  '''Redwood Shores''',
1082   --                              NULL, 'DEFAULT')
1083   --   and when the flag is set to TRUE:
1084   --        expr = DECODE("mycol", 'Waltham',        1,
1085   --                               'Burlington',     1,
1086   --                               'Redwood Shores', 2)
1087   --        rexp = DECODE("COL1", 1,    '''Waltham'', ''Burlington''',
1088   --                              2,    '''Redwood Shores''',
1089   --                              NULL, 'DEFAULT')
1090   -----------------------------------------------------------------------------
1091   PROCEDURE stack_bin_cat(
1092     bin_table_name                 VARCHAR2,
1093     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
1094     literal_flag                   BOOLEAN DEFAULT FALSE,
1095     bin_schema_name                VARCHAR2 DEFAULT NULL);
1096 
1097   -----------------------------------------------------------------------------
1098   --                            insert_bin_super                             --
1099   -----------------------------------------------------------------------------
1100   -- NAME
1101   --   insert_bin_super - INSERT into BINning SUPERvised
1102   -- DESCRIPTION
1103   --   For every NUMBER, VARCHAR2, or CHAR, as well as DM_NESTED_NUMERICAL, or
1104   --   DM_NESTED_CATEGORICAL column in the data table that is not in the
1105   --   exclusion list finds numerical and categorical binning definition
1106   --   and inserts them into the corresponding definition tables. Definition
1107   --   for each relevant column is computed based on the splits found by the
1108   --   Decision Tree model build on a single predictor. Columns that have no
1109   --   splits are inserted into the column removal definition table.
1110   -- PARAMETERS
1111   --   num_table_name                 - numerical binning definition table
1112   --   cat_table_name                 - categorical binning definition table
1113   --   data_table_name                - data table
1114   --   target_column_name             - target column
1115   --   max_bin_num                    - maximum number of bins
1116   --   exclude_list                   - column exclusion list
1117   --   num_schema_name                - numerical definition table schema
1118   --   cat_schema_name                - categorical definition table schema
1119   --   data_schema_name               - data table schema name
1120   --   rem_table_name                 - column removal definition table
1121   --   rem_schema_name                - removal definition table schema
1122   -- RETURNS
1123   --   None
1124   -- EXCEPTIONS
1125   --   None
1126   -- NOTES
1127   --   Column removal definition table is optional.
1128   -----------------------------------------------------------------------------
1129   PROCEDURE insert_bin_super(
1130     num_table_name                 VARCHAR2,
1131     cat_table_name                 VARCHAR2,
1132     data_table_name                VARCHAR2,
1136     num_schema_name                VARCHAR2 DEFAULT NULL,
1133     target_column_name             VARCHAR2,
1134     max_bin_num                    PLS_INTEGER DEFAULT 1000,
1135     exclude_list                   COLUMN_LIST DEFAULT NULL,
1137     cat_schema_name                VARCHAR2 DEFAULT NULL,
1138     data_schema_name               VARCHAR2 DEFAULT NULL,
1139     rem_table_name                 VARCHAR2 DEFAULT NULL,
1140     rem_schema_name                VARCHAR2 DEFAULT NULL);
1141 
1142   -----------------------------------------------------------------------------
1143   --                             xform_expr_num                              --
1144   -----------------------------------------------------------------------------
1145   -- NAME
1146   --   xform_expr_num - XFORMation EXPRession NUMber
1147   -- DESCRIPTION
1148   --   Creates a view that applies a given expression for every NUMBER column
1149   --   in the data table that is not in the exclusion list and in the
1150   --   inclusion list. The remaining columns do not change. Expressions are
1151   --   constructed from the expression pattern by replacing every occurance of
1152   --   the column pattern with an actual column name.
1153   --   Example 1. For a table TAB with two NUMBER columns CN1, CN3 and one
1154   --   CHAR columns CC2 and expression pattern TO_CHAR(:col) the following
1155   --   query is generated:
1156   --       SELECT TO_CHAR("CN1") "CN1", "CC2", TO_CHAR("CN3") "CN3"
1157   --         FROM TAB
1158   --   Example 2. This procedure can be used for clipping (winsorizing)
1159   --   normalized data to a [0..1] range, that is values x > 1 become 1 and
1160   --   values x < 0 become 0. For the table in example 1 and pattern
1161   --       CASE WHEN :col < 0 THEN 0 WHEN :col > 1 THEN 1 ELSE :col END
1162   --   the following query is generated:
1163   --       SELECT CASE WHEN "CN1" < 0 THEN 0 WHEN "CN1" > 1 THEN 1
1164   --                   ELSE "CN1" END "CN1",
1165   --              "CC2",
1166   --              CASE WHEN "CN3" < 0 THEN 0 WHEN "CN3" > 1 THEN 1
1167   --                   ELSE "CN3" END "CN3"
1168   --         FROM TAB
1169   -- PARAMETERS
1170   --   expr_pattern                   - expression pattern
1171   --   data_table_name                - data table
1172   --   xform_view_name                - xform view
1173   --   exclude_list                   - column exclusion list
1174   --   include_list                   - column inclusion list
1175   --   col_pattern                    - column pattern
1176   --   data_schema_name               - data table schema name
1177   --   xform_schema_name              - xform view schema name
1178   -- RETURNS
1179   --   None
1180   -- EXCEPTIONS
1181   --   None
1182   -- NOTES
1183   --   The default value of column pattern is ':col'. Column pattern is case
1184   --   sensetive. Expressions are constructed using SQL REPLACE function:
1185   --       REPALCE(expr_pattern, col_pattern, '"<column>"')||' "<column>"'
1186   --   NULL exclusion list is treated as an empty set (exclude none) and NULL
1187   --   inclusion list is treated as a full set (include all).
1188   -----------------------------------------------------------------------------
1189   PROCEDURE xform_expr_num(
1190     expr_pattern                   VARCHAR2,
1191     data_table_name                VARCHAR2,
1192     xform_view_name                VARCHAR2,
1193     exclude_list                   COLUMN_LIST DEFAULT NULL,
1194     include_list                   COLUMN_LIST DEFAULT NULL,
1195     col_pattern                    VARCHAR2 DEFAULT ':col',
1196     data_schema_name               VARCHAR2 DEFAULT NULL,
1197     xform_schema_name              VARCHAR2 DEFAULT NULL);
1198 
1199   -----------------------------------------------------------------------------
1200   --                             xform_expr_str                              --
1201   -----------------------------------------------------------------------------
1202   -- NAME
1203   --   xform_expr_str - XFORMation EXPRession STRing
1204   -- DESCRIPTION
1205   --   Similar to xform_expr_num, except that it applies to CHAR and VARCHAR2
1206   --   columns instead of NUMBER.
1207   -- PARAMETERS
1208   --   expr_pattern                   - expression pattern
1209   --   data_table_name                - data table
1210   --   xform_view_name                - xform view
1211   --   exclude_list                   - column exclusion list
1212   --   include_list                   - column inclusion list
1213   --   col_pattern                    - column pattern
1214   --   data_schema_name               - data table schema name
1215   --   xform_schema_name              - xform view schema name
1216   -- RETURNS
1217   --   None
1218   -- EXCEPTIONS
1219   --   None
1220   -- NOTES
1221   --   None
1222   -----------------------------------------------------------------------------
1223   PROCEDURE xform_expr_str(
1224     expr_pattern                   VARCHAR2,
1225     data_table_name                VARCHAR2,
1226     xform_view_name                VARCHAR2,
1227     exclude_list                   COLUMN_LIST DEFAULT NULL,
1228     include_list                   COLUMN_LIST DEFAULT NULL,
1229     col_pattern                    VARCHAR2 DEFAULT ':col',
1230     data_schema_name               VARCHAR2 DEFAULT NULL,
1231     xform_schema_name              VARCHAR2 DEFAULT NULL);
1232 
1233   -----------------------------------------------------------------------------
1234   --                              create_clip                                --
1235   -----------------------------------------------------------------------------
1236   -- NAME
1237   --   create_clip - CREATE CLIPping definition table
1241   --         col  VARCHAR2(30),
1238   -- DESCRIPTION
1239   --   Creates clippping definition table:
1240   --       CREATE TABLE <clip>(
1242   --         att  VARCHAR2(4000),
1243   --         lcut NUMBER,
1244   --         lval NUMBER,
1245   --         rcut NUMBER,
1246   --         rval NUMBER)
1247   --   This table is used to guide query generation process to construct
1248   --   clipping expressions of the following form:
1249   --       CASE WHEN "{col}" < {lcut} THEN {lval}
1250   --            WHEN "{col}" > {rcut} THEN {rval}
1251   --                                  ELSE "{col}"
1252   --       END "{col}"
1253   --   Example 1. (winsorizing) When col = 'my_col', lcut = -1.5, lval = -1.5,
1254   --   and rcut = 4.5 and rval = 4.5 the following expression is generated:
1255   --       CASE WHEN "my_col" < -1.5 THEN -1.5
1256   --            WHEN "my_col" >  4.5 THEN  4.5
1257   --                                 ELSE "my_col"
1258   --       END "my_col"
1259   --   Example 2. (trimming) When col = 'my_col', lcut = -1.5, lval = NULL,
1260   --   and rcut = 4.5 and rval = NULL the following expression is generated:
1261   --       CASE WHEN "my_col" < -1.5 THEN NULL
1262   --            WHEN "my_col" >  4.5 THEN NULL
1263   --                                 ELSE "my_col"
1264   --       END "my_col"
1265   -- PARAMETERS
1266   --   clip_table_name                - clipping definition table
1267   --   clip_schema_name               - clipping definition table schema name
1268   -- RETURNS
1269   --   None
1270   -- EXCEPTIONS
1271   --   None
1272   -- NOTES
1273   --   Note, that {col} is case sensitive since it generates quoted
1274   --   identifiers.  When there are multiple entries in the xform defintion
1275   --   table for the same {col} the behavior is undefined.  Any one of the
1276   --   definitions may be used in query generation. NULL values remain
1277   --   unchanged.
1278   -----------------------------------------------------------------------------
1279   PROCEDURE create_clip(
1280     clip_table_name                VARCHAR2,
1281     clip_schema_name               VARCHAR2 DEFAULT NULL);
1282 
1283   -----------------------------------------------------------------------------
1284   --                        insert_clip_winsor_tail                          --
1285   -----------------------------------------------------------------------------
1286   -- NAME
1287   --   insert_clip_winsor_tail - INSERT into CLIPping WINSORizing TAIL
1288   -- DESCRIPTION
1289   --   For every NUMBER column in the data table that is not in the exclusion
1290   --   list finds clipping definition and inserts it into the definition
1291   --   table. Definition for each relevant column is computed based on the
1292   --   non-NULL values sorted in ascending order such that val(1)<val(2)<...<
1293   --   val(N), where N is a total number of non-NULL values in a column:
1294   --       lcut = val(1+floor(N*q))
1295   --       lval = lcut
1296   --       rcut = val(N-floor(N*q))
1297   --       rval = rcut
1298   --   where q = ABS(NVL(tail_frac,0)). Nothing is done when q >= 0.5.
1299   -- PARAMETERS
1300   --   clip_table_name                - clipping definition table
1301   --   data_table_name                - data table
1302   --   tail_frac                      - tail fraction
1303   --   exclude_list                   - column exclusion list
1304   --   clip_schema_name               - definition table schema name
1305   --   data_schema_name               - data table schema name
1306   -- RETURNS
1307   --   None
1308   -- EXCEPTIONS
1309   --   None
1310   -- NOTES
1311   --   Columns with all NULLs are ignored.
1312   -----------------------------------------------------------------------------
1313   PROCEDURE insert_clip_winsor_tail(
1314     clip_table_name                VARCHAR2,
1315     data_table_name                VARCHAR2,
1316     tail_frac                      NUMBER DEFAULT 0.025,
1317     exclude_list                   COLUMN_LIST DEFAULT NULL,
1318     clip_schema_name               VARCHAR2 DEFAULT NULL,
1319     data_schema_name               VARCHAR2 DEFAULT NULL);
1320 
1321   -----------------------------------------------------------------------------
1322   --                        insert_clip_trim_tail                            --
1323   -----------------------------------------------------------------------------
1324   -- NAME
1325   --   insert_clip_trim_tail - INSERT into CLIPping TRIMming TAIL
1326   -- DESCRIPTION
1327   --   For every NUMBER column in the data table that is not in the exclusion
1328   --   list finds clipping definition and inserts it into the definition
1329   --   table. Definition for each relevant column is computed based on the
1330   --   non-NULL values sorted in ascending order such that val(1)<val(2)<...<
1331   --   val(N), where N is a total number of non-NULL values in a column:
1332   --       lcut = val(1+floor(N*q))
1333   --       lval = NULL
1334   --       rcut = val(N-floor(N*q))
1335   --       rval = NULL
1336   --   where q = ABS(NVL(tail_frac,0)). Nothing is done when q >= 0.5.
1337   -- PARAMETERS
1338   --   clip_table_name                - clipping definition table
1339   --   data_table_name                - data table
1340   --   tail_frac                      - tail fraction
1341   --   exclude_list                   - column exclusion list
1342   --   clip_schema_name               - definition table schema name
1343   --   data_schema_name               - data table schema name
1344   -- RETURNS
1345   --   None
1346   -- EXCEPTIONS
1347   --   None
1348   -- NOTES
1349   --   Columns with all NULLs are ignored.
1353     data_table_name                VARCHAR2,
1350   -----------------------------------------------------------------------------
1351   PROCEDURE insert_clip_trim_tail(
1352     clip_table_name                VARCHAR2,
1354     tail_frac                      NUMBER DEFAULT 0.025,
1355     exclude_list                   COLUMN_LIST DEFAULT NULL,
1356     clip_schema_name               VARCHAR2 DEFAULT NULL,
1357     data_schema_name               VARCHAR2 DEFAULT NULL);
1358 
1359   -----------------------------------------------------------------------------
1360   --                               xform_clip                                --
1361   -----------------------------------------------------------------------------
1362   -- NAME
1363   --   xform_clip - XFORMation CLIPping
1364   -- DESCRIPTION
1365   --   Creates a view that perfoms clipping of the data table. Only the
1366   --   columns that are specified in the xform definition are clipped, the
1367   --   remaining columns do not change.
1368   -- PARAMETERS
1369   --   clip_table_name                - xform definition table
1370   --   data_table_name                - data table
1371   --   xform_view_name                - xform view
1372   --   clip_schema_name               - xform definition table schema name
1373   --   data_schema_name               - data table schema name
1374   --   xform_schema_name              - xform view schema name
1375   -- RETURNS
1376   --   None
1377   -- EXCEPTIONS
1378   --   None
1379   -- NOTES
1380   --   None
1381   -----------------------------------------------------------------------------
1382   PROCEDURE xform_clip(
1383     clip_table_name                VARCHAR2,
1384     data_table_name                VARCHAR2,
1385     xform_view_name                VARCHAR2,
1386     clip_schema_name               VARCHAR2 DEFAULT NULL,
1387     data_schema_name               VARCHAR2 DEFAULT NULL,
1388     xform_schema_name              VARCHAR2 DEFAULT NULL);
1389 
1390   -----------------------------------------------------------------------------
1391   --                                stack_clip                               --
1392   -----------------------------------------------------------------------------
1393   -- NAME
1394   --   stack_clip - STACK CLIPping
1395   -- DESCRIPTION
1396   --   For every column in the stack definition that has a matching entry in
1397   --   the transformation definition, stacks clipping expression on top of the
1398   --   existing expression and updates the stack definition. Columns in the
1399   --   stack definition that have no matching entries in the transformation
1400   --   definition or have NULL expression are not changed. Columns in the
1401   --   transformation definition that have no matching entries in the stack
1402   --   definition are added to the stack definition (using "{col}" in place of
1403   --   the original expression or "VALUE" for nested attributes).
1404   --   Reverse expressions in the stack definition are updated accordingly,
1405   --   that is if an expression is updated, added or remains unchanged then a
1406   --   corresponding reverse expression is also updated, added or remains
1407   --   unchanged. When reverse expression is NULL it is treated as identity
1408   --   function ("{attr}" or "VALUE" for nested attributes).
1409   --   Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
1410   --   matching entry in the transformation definition, after stacking {expr}
1411   --   has the following form:
1412   --       CASE WHEN {expr} < {lcut} THEN {lval}
1413   --            WHEN {expr} > {rcut} THEN {rval}
1414   --                                 ELSE {expr}
1415   --       END
1416   --   and {rexp} remains unchanged
1417   --   Example 1. Given transformation definition:
1418   --       {col = COL1, lcut = -1.5, lval = -1.5, rcut = 4.5, rval = 4.5}
1419   --       {col = COL2, lcut = 0,    lval = 0,    rcut = 1,   rval = 1}
1420   --   and stack definition:
1421   --       {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
1422   --       {attr = COL3, expr = ln(COL3),      rexp = exp(COL3)}
1423   --   the following updated stack definition is generated:
1424   --       {attr = COL1,
1425   --        expr = CASE WHEN log(10, COL1) < -1.5 THEN -1.5
1426   --                    WHEN log(10, COL1) > 4.5  THEN 4.5
1427   --                                              ELSE log(10, COL1)
1428   --               END,
1429   --        rexp = power(10, COL1)}
1430   --       {attr = COL3,
1431   --        expr = ln(COL3),
1432   --        rexp = exp(COL3)}
1433   --       {attr = COL2,
1434   --        expr = CASE WHEN COL2 < 0 THEN 0
1435   --                    WHEN COL2 > 1 THEN 1
1436   --                                  ELSE COL2
1437   --               END,
1438   --        rexp = NULL}
1439   -- PARAMETERS
1440   --   clip_table_name                - xform definition table
1441   --   xform_list                     - stack definition
1442   --   clip_schema_name               - xform definition table schema name
1443   -- RETURNS
1444   --   xform_list                     - updated stack definition
1445   -- EXCEPTIONS
1446   --   None
1447   -- NOTES
1448   --   None
1449   -----------------------------------------------------------------------------
1450   PROCEDURE stack_clip(
1451     clip_table_name                VARCHAR2,
1452     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
1453     clip_schema_name               VARCHAR2 DEFAULT NULL);
1454 
1455   -----------------------------------------------------------------------------
1456   --                            create_miss_num                              --
1460   --                     table
1457   -----------------------------------------------------------------------------
1458   -- NAME
1459   --   create_miss_num - CREATE NUMerical MISSing value treatment definition
1461   -- DESCRIPTION
1462   --   Creates numerical missing value treatment definition table:
1463   --       CREATE TABLE <miss_num>(
1464   --         col VARCHAR2(30),
1465   --         att VARCHAR2(4000),
1466   --         val NUMBER)
1467   --   This table is used to guide query generation process to construct
1468   --   missing value treatment expressions of the following form:
1469   --       NVL("{col}", {val}) "{col}"
1470   --   For example when col = 'my_col', val = 20 the
1471   --   following expression is generated:
1472   --       NVL("my_col", 20) "my_col"
1473   -- PARAMETERS
1474   --   miss_table_name                - definition table
1475   --   miss_schema_name               - definition table schema name
1476   -- RETURNS
1477   --   None
1478   -- EXCEPTIONS
1479   --   None
1480   -- NOTES
1481   --   Note, that {col} is case sensitive since it generates quoted
1482   --   identifiers.  When there are multiple entries in the xform defintion
1483   --   table for the same {col} the behavior is undefined.  Any one of the
1484   --   definitions may be used in query generation.
1485   -----------------------------------------------------------------------------
1486   PROCEDURE create_miss_num(
1487     miss_table_name                VARCHAR2,
1488     miss_schema_name               VARCHAR2 DEFAULT NULL);
1489 
1490   -----------------------------------------------------------------------------
1491   --                           insert_miss_num_mean                          --
1492   -----------------------------------------------------------------------------
1493   -- NAME
1494   --   insert_miss_num_mean - INSERT into NUMerical MISSining value treatment
1495   --                          MEAN
1496   -- DESCRIPTION
1497   --   For every NUMBER column in the data table that is not in the exclusion
1498   --   list finds missing value treatment definition and inserts it into the
1499   --   definition table. Definition for each relevant column is computed based
1500   --   on the mean (average) value that is computed from the data table:
1501   --       val = mean
1502   --   The value of mean is rounded to round_num significant digits prior to
1503   --   storing it in the definition table.
1504   -- PARAMETERS
1505   --   miss_table_name                - definition table
1506   --   data_table_name                - data table
1507   --   exclude_list                   - column exclusion list
1508   --   round_num                      - number of significant digits
1509   --   miss_schema_name               - definition table schema name
1510   --   data_schema_name               - data table schema name
1511   -- RETURNS
1512   --   None
1513   -- EXCEPTIONS
1514   --   None
1515   -- NOTES
1516   --   Columns with all NULLs are ignored.
1517   -----------------------------------------------------------------------------
1518   PROCEDURE insert_miss_num_mean(
1519     miss_table_name                VARCHAR2,
1520     data_table_name                VARCHAR2,
1521     exclude_list                   COLUMN_LIST DEFAULT NULL,
1522     round_num                      PLS_INTEGER DEFAULT 6,
1523     miss_schema_name               VARCHAR2 DEFAULT NULL,
1524     data_schema_name               VARCHAR2 DEFAULT NULL);
1525 
1526   -----------------------------------------------------------------------------
1527   --                              xform_miss_num                             --
1528   -----------------------------------------------------------------------------
1529   -- NAME
1530   --   xform_miss_num - XFORMation NUMerical MISSing value treatment
1531   -- DESCRIPTION
1532   --   Creates a view that perfoms numerical missing value treatment of the
1533   --   data table. Only the columns that are specified in the xform definition
1534   --   are treated, the remaining columns do not change.
1535   -- PARAMETERS
1536   --   miss_table_name                - xform definition table
1537   --   data_table_name                - data table
1538   --   xform_view_name                - xform view
1539   --   miss_schema_name               - xform definition table schema name
1540   --   data_schema_name               - data table schema name
1541   --   xform_schema_name              - xform view schema name
1542   -- RETURNS
1543   --   None
1544   -- EXCEPTIONS
1545   --   None
1546   -- NOTES
1547   --   None
1548   -----------------------------------------------------------------------------
1549   PROCEDURE xform_miss_num(
1550     miss_table_name                VARCHAR2,
1551     data_table_name                VARCHAR2,
1552     xform_view_name                VARCHAR2,
1553     miss_schema_name               VARCHAR2 DEFAULT NULL,
1554     data_schema_name               VARCHAR2 DEFAULT NULL,
1555     xform_schema_name              VARCHAR2 DEFAULT NULL);
1556 
1557   -----------------------------------------------------------------------------
1558   --                              stack_miss_num                             --
1559   -----------------------------------------------------------------------------
1560   -- NAME
1561   --   stack_miss_num - STACK NUMerical MISSing value treatment
1562   -- DESCRIPTION
1563   --   For every column in the stack definition that has a matching entry in
1564   --   the transformation definition, stacks numerical missing value treatment
1565   --   expression on top of the existing expression and updates the stack
1569   --   matching entries in the stack definition are added to the stack
1566   --   definition. Columns in the stack definition that have no matching
1567   --   entries in the transformation definition or have NULL expression are
1568   --   not changed. Columns in the transformation definition that have no
1570   --   definition (using "{col}" in place of the original expression or "VALUE"
1571   --   for nested attributes). Reverse expressions in the stack definition are
1572   --   updated accordingly, that is if an expression is updated, added or
1573   --   remains unchanged then a corresponding reverse expression is also
1574   --   updated, added or remains unchanged. When reverse expression is NULL it
1575   --   is treated as identity function ("{attr}" or "VALUE" for nested
1576   --   attributes). Given an entry ({attr}, {expr}, {rexp}) in the stack
1577   --   definition and a matching entry in the transformation definition, after
1578   --   stacking {expr} has the following form:
1579   --       NVL({expr}, {val})
1580   --   and {rexp} remains unchanged
1581   --   Example 1. Given transformation definition:
1582   --       {col = COL1, val = 4.5}
1583   --       {col = COL2, val = 0}
1584   --   and stack definition:
1585   --       {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
1586   --       {attr = COL3, expr = ln(COL3),      rexp = exp(COL3)}
1587   --   the following updated stack definition is generated:
1588   --       {attr = COL1,
1589   --        expr = NVL(log(10, COL1), 4.5),
1590   --        rexp = power(10, COL1)}
1591   --       {attr = COL3,
1592   --        expr = ln(COL3),
1593   --        rexp = exp(COL3)}
1594   --       {attr = COL2,
1595   --        expr = NVL(COL2, 0),
1596   --        rexp = NULL}
1597   -- PARAMETERS
1598   --   miss_table_name                - xform definition table
1599   --   xform_list                     - stack definition
1600   --   miss_schema_name               - xform definition table schema name
1601   -- RETURNS
1602   --   xform_list                     - updated stack definition
1603   -- EXCEPTIONS
1604   --   None
1605   -- NOTES
1606   --   None
1607   -----------------------------------------------------------------------------
1608   PROCEDURE stack_miss_num(
1609     miss_table_name                VARCHAR2,
1610     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
1611     miss_schema_name               VARCHAR2 DEFAULT NULL);
1612 
1613   -----------------------------------------------------------------------------
1614   --                            create_miss_cat                              --
1615   -----------------------------------------------------------------------------
1616   -- NAME
1617   --   create_miss_cat - CREATE CATegorical MISSing value treatment definition
1618   --                     table
1619   -- DESCRIPTION
1620   --   Creates categorical missing value treatment definition table:
1621   --       CREATE TABLE <miss_cat>(
1622   --         col VARCHAR2(30),
1623   --         att VARCHAR2(4000),
1624   --         val VARCHAR2(4000))
1625   --   This table is used to guide query generation process to construct
1626   --   missing value treatment expressions of the following form:
1627   --       NVL("{col}", {val}) "{col}"
1628   --   For example when col = 'zip_code', val = 'MA' the
1629   --   following expression is generated:
1630   --       NVL("zip_code", 'MA') "zip_code"
1631   -- PARAMETERS
1632   --   miss_table_name                - definition table
1633   --   miss_schema_name               - definition table schema name
1634   -- RETURNS
1635   --   None
1636   -- EXCEPTIONS
1637   --   None
1638   -- NOTES
1639   --   Note, that {col} is case sensitive since it generates quoted
1640   --   identifiers.  When there are multiple entries in the xform defintion
1641   --   table for the same {col} the behavior is undefined.  Any one of the
1642   --   definitions may be used in query generation.
1643   -----------------------------------------------------------------------------
1644   PROCEDURE create_miss_cat(
1645     miss_table_name                VARCHAR2,
1646     miss_schema_name               VARCHAR2 DEFAULT NULL);
1647 
1648   -----------------------------------------------------------------------------
1649   --                           insert_miss_cat_mode                          --
1650   -----------------------------------------------------------------------------
1651   -- NAME
1652   --   insert_miss_cat_mode - INSERT into CATegorical MISSining value
1653   --                          treatment MODE
1654   -- DESCRIPTION
1655   --   For every VARCHAR2, CHAR column in the data table that is not in the
1656   --   exclusion list finds missing value treatment definition and inserts it
1657   --   into the definition table. Definition for each relevant column is
1658   --   computed based on the mode value that is computed from the data table:
1659   --       val = mode
1660   -- PARAMETERS
1661   --   miss_table_name                - definition table
1662   --   data_table_name                - data table
1663   --   exclude_list                   - column exclusion list
1664   --   miss_schema_name               - definition table schema name
1665   --   data_schema_name               - data table schema name
1666   -- RETURNS
1667   --   None
1668   -- EXCEPTIONS
1669   --   None
1670   -- NOTES
1671   --   Columns with all NULLs are ignored.
1672   -----------------------------------------------------------------------------
1673   PROCEDURE insert_miss_cat_mode(
1674     miss_table_name                VARCHAR2,
1675     data_table_name                VARCHAR2,
1676     exclude_list                   COLUMN_LIST DEFAULT NULL,
1680   -----------------------------------------------------------------------------
1677     miss_schema_name               VARCHAR2 DEFAULT NULL,
1678     data_schema_name               VARCHAR2 DEFAULT NULL);
1679 
1681   --                              xform_miss_cat                             --
1682   -----------------------------------------------------------------------------
1683   -- NAME
1684   --   xform_miss_cat - XFORMation CATegorical MISSing value treatment
1685   -- DESCRIPTION
1686   --   Creates a view that perfoms categorical missing value treatment of the
1687   --   data table. Only the columns that are specified in the xform definition
1688   --   are treated, the remaining columns do not change.
1689   -- PARAMETERS
1690   --   miss_table_name                - xform definition table
1691   --   data_table_name                - data table
1692   --   xform_view_name                - xform view
1693   --   miss_schema_name               - xform definition table schema name
1694   --   data_schema_name               - data table schema name
1695   --   xform_schema_name              - xform view schema name
1696   -- RETURNS
1697   --   None
1698   -- EXCEPTIONS
1699   --   None
1700   -- NOTES
1701   --   The data type of the xformed columns is preseved by putting a CAST
1702   --   expression around NVL. For example, when col = 'zip_code', val = 'MA'
1703   --   the data type is CHAR(2) the following expression is generated:
1704   --       CAST(NVL("zip_code", 'MA') AS CHAR(2)) "zip_code"
1705   -----------------------------------------------------------------------------
1706   PROCEDURE xform_miss_cat(
1707     miss_table_name                VARCHAR2,
1708     data_table_name                VARCHAR2,
1709     xform_view_name                VARCHAR2,
1710     miss_schema_name               VARCHAR2 DEFAULT NULL,
1711     data_schema_name               VARCHAR2 DEFAULT NULL,
1712     xform_schema_name              VARCHAR2 DEFAULT NULL);
1713 
1714   -----------------------------------------------------------------------------
1715   --                              stack_miss_cat                             --
1716   -----------------------------------------------------------------------------
1717   -- NAME
1718   --   stack_miss_cat - STACK CATegorical MISSing value treatment
1719   -- DESCRIPTION
1720   --   For every column in the stack definition that has a matching entry in
1721   --   the transformation definition, stacks categorical missing value
1722   --   treatment expression on top of the existing expression and updates the
1723   --   stack definition. Columns in the stack definition that have no matching
1724   --   entries in the transformation definition or have NULL expression are
1725   --   not changed. Columns in the transformation definition that have no
1726   --   matching entries in the stack definition are added to the stack
1727   --   definition (using "{col}" in place of the original expression or "VALUE"
1728   --   for nested attributes). Reverse expressions in the stack definition are
1729   --   updated accordingly, that is if an expression is updated, added or
1730   --   remains unchanged then a corresponding reverse expression is also
1731   --   updated, added or remains unchanged. When reverse expression is NULL it
1732   --   is treated as identity function ("{attr}" or "VALUE" for nested
1733   --   attributes). Given an entry ({attr}, {expr}, {rexp}) in the stack
1734   --   definition and a matching entry in the transformation definition, after
1735   --   stacking {expr} has the following form:
1736   --       NVL({expr}, {val})
1737   --   and {rexp} remains unchanged
1738   --   Example 1. Given transformation definition:
1739   --       {col = COL1, val = 'ma'}
1740   --       {col = COL2, val = 'CA'}
1741   --   and stack definition:
1742   --       {attr = COL1, expr = lower(COL1), rexp = initcap(COL1)}
1743   --       {attr = COL3, expr = upper(COL3), rexp = initcap(COL3)}
1744   --   the following updated stack definition is generated:
1745   --       {attr = COL1,
1746   --        expr = NVL(lower(COL1), 'ma'),
1747   --        rexp = initcap(COL1)}
1748   --       {attr = COL3,
1749   --        expr = upper(COL3),
1750   --        rexp = initcap(COL3)}
1751   --       {attr = COL2,
1752   --        expr = NVL(COL2, 'CA'),
1753   --        rexp = NULL}
1754   -- PARAMETERS
1755   --   miss_table_name                - xform definition table
1756   --   xform_list                     - stack definition
1757   --   miss_schema_name               - xform definition table schema name
1758   -- RETURNS
1759   --   xform_list                     - updated stack definition
1760   -- EXCEPTIONS
1761   --   None
1762   -- NOTES
1763   --   None
1764   -----------------------------------------------------------------------------
1765   PROCEDURE stack_miss_cat(
1766     miss_table_name                VARCHAR2,
1767     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
1768     miss_schema_name               VARCHAR2 DEFAULT NULL);
1769 
1770   -----------------------------------------------------------------------------
1771   --                              xform_stack                                --
1772   -----------------------------------------------------------------------------
1773   -- NAME
1774   --   xform_stack - XFORMation STACK
1775   -- DESCRIPTION
1776   --   Creates a view that perfoms transformation of the data table specified
1777   --   by the stack definition. Only the columns that are specified in the
1778   --   stack definition are transformed, the remaining columns do not change.
1779   --   All columns in the stack definition are applied. Columns with NULL value
1780   --   in the expression field are excluded. Columns in the stack definition
1784   --   data_table_name                - data table
1781   --   that do not have a matching column in the data are added to the view.
1782   -- PARAMETERS
1783   --   xform_list                     - stack definition
1785   --   xform_view_name                - xform view
1786   --   data_schema_name               - data table schema name
1787   --   xform_schema_name              - xform view schema name
1788   -- RETURNS
1789   --   None
1790   -- EXCEPTIONS
1791   --   None
1792   -- NOTES
1793   --   None
1794   -----------------------------------------------------------------------------
1795   PROCEDURE xform_stack(
1796     xform_list                     TRANSFORM_LIST,
1797     data_table_name                VARCHAR2,
1798     xform_view_name                VARCHAR2,
1799     data_schema_name               VARCHAR2 DEFAULT NULL,
1800     xform_schema_name              VARCHAR2 DEFAULT NULL);
1801 
1802   -----------------------------------------------------------------------------
1803   --                             describe_stack                              --
1804   -----------------------------------------------------------------------------
1805   -- NAME
1806   --   describe_stack - DESCRIBE STACK
1807   -- DESCRIPTION
1808   --   Desribes transformation of the data table specified by the stack
1809   --   definition. Only the columns that are specified in the stack definition
1810   --   are transformed, the remaining columns do not change. All columns in the
1811   --   stack definition are applied. Columns with NULL value in the expression
1812   --   field are excluded. Columns in the stack definition that do not have a
1813   --   matching column in the data are added to the describe list.
1814   -- PARAMETERS
1815   --   xform_list                     - stack definition
1816   --   data_table_name                - data table
1817   --   describe_list                  - describe list/describe list3
1818   --   data_schema_name               - data table schema name
1819   -- RETURNS
1820   --   None
1821   -- EXCEPTIONS
1822   --   None
1823   -- NOTES
1824   --   When the DESCRIBE_LIST type is specified for the describe_list parameter
1825   --   a DM_NESTED_NUMERICALS type is indicated by a value of nest_num_col_type
1826   --   in the col_type field and nest_cat_col_type for DM_NESTED_CATEGORICALS.
1827   --
1828   --   When the DESCRIBE_LIST3 type is specified for the describe_list
1829   --   parameter, nested types are indicated with the ADT type in the col_type
1830   --   field with the specific nested type indicated in the col_name_type
1831   --   field as a string.  This behavior is consistent with the call to
1832   --   describe_columns3 in the dbms_sql package (dbmssql.sql).
1833   -----------------------------------------------------------------------------
1834   PROCEDURE describe_stack(
1835     xform_list                     TRANSFORM_LIST,
1836     data_table_name                VARCHAR2,
1837     describe_list                  OUT DESCRIBE_LIST,
1838     data_schema_name               VARCHAR2 DEFAULT NULL);
1839 
1840   PROCEDURE describe_stack(
1841     xform_list                     TRANSFORM_LIST,
1842     data_table_name                VARCHAR2,
1843     describe_list                  OUT DESCRIBE_LIST3,
1844     data_schema_name               VARCHAR2 DEFAULT NULL);
1845 
1846   -----------------------------------------------------------------------------
1847   --                             set_expression                              --
1848   -----------------------------------------------------------------------------
1849   -- NAME
1850   --   set_expression - SET xform EXPRESSION
1851   -- DESCRIPTION
1852   --   Appends a VARCHAR2 chunk to the expression. Appeding NULL clears the
1853   --   expression.
1854   -- PARAMETERS
1855   --   expression                     - expression
1856   --   chunk                          - VARCHAR2 chunk
1857   -- RETURNS
1858   --   None
1859   -- EXCEPTIONS
1860   --   None
1861   -- NOTES
1862   --   None
1863   -----------------------------------------------------------------------------
1864   PROCEDURE set_expression(
1865     expression                     IN OUT NOCOPY EXPRESSION_REC,
1866     chunk                          VARCHAR2 DEFAULT NULL);
1867 
1868   -----------------------------------------------------------------------------
1869   --                             get_expression                              --
1870   -----------------------------------------------------------------------------
1871   -- NAME
1872   --   get_expression - GET xform EXPRESSION
1873   -- DESCRIPTION
1874   --   Gets a VARCHAR2 chunk from the expression. Chunks numbering starts from
1875   --   one. For chunks outside of the range the return value is NULL. When
1876   --   chunk number is NULL the whole expression is returned as a string. If
1877   --   expression is too big VALUE_ERROR is raised.
1878   -- PARAMETERS
1879   --   expression                     - expression
1880   --   chunk_num                      - chunk number
1881   -- RETURNS
1882   --   VARCHAR2 chunk.
1883   -- EXCEPTIONS
1884   --   VALUE_ERROR
1885   -- NOTES
1886   --   None
1887   -----------------------------------------------------------------------------
1888   FUNCTION get_expression(
1889     expression                     EXPRESSION_REC,
1890     chunk_num                      PLS_INTEGER DEFAULT NULL)
1891   RETURN VARCHAR2;
1892 
1893   -----------------------------------------------------------------------------
1894   --                             set_transform                               --
1895   -----------------------------------------------------------------------------
1896   -- NAME
1897   --   set_transform - SET TRANSFORM list
1898   -- DESCRIPTION
1899   --   Appends an element to the transformation list.
1900   -- PARAMETERS
1901   --   expression                     - expression
1902   --   chunk                          - VARCHAR2 chunk
1903   -- RETURNS
1904   --   None
1905   -- EXCEPTIONS
1906   --   None
1907   -- NOTES
1908   --   None
1909   -----------------------------------------------------------------------------
1910   PROCEDURE set_transform(
1911     xform_list                     IN OUT NOCOPY TRANSFORM_LIST,
1912     attribute_name                 VARCHAR2,
1913     attribute_subname              VARCHAR2,
1914     expression                     CLOB,
1915     reverse_expression             VARCHAR2,
1916     attribute_spec                 VARCHAR2 DEFAULT NULL);
1917 END dbms_data_mining_transform;