1 PACKAGE dbms_data_mining_transform AUTHID CURRENT_USER AS
2 -----------------------------------------------------------------------------
3 -- COLUMN LIST collection type
4 -----------------------------------------------------------------------------
5 -- This type is used to store both quoted and non-quoted identifiers for
6 -- column names of a table.
7 TYPE
8 COLUMN_LIST IS VARRAY(1000) OF VARCHAR2(32);
9
10 -----------------------------------------------------------------------------
11 -- DESCribe LIST collection TYPE
12 -----------------------------------------------------------------------------
13 -- This type holds the results of describe operations. Note that col_name
14 -- field of DESC_TAB2 is VARCHAR2(32767). This is applicable to cases when a
15 -- SELECT expression is not given an alias and thus the expression itself is
16 -- used as name (with whitespaces removed) and thus can be larger than 30
17 -- bytes. However when those names get larger than 30 bytes the corresponding
18 -- SELECT cannot be used as an inline view.
19 SUBTYPE
20 DESCRIBE_LIST IS dbms_sql.DESC_TAB2;
21
22 SUBTYPE
23 DESCRIBE_LIST3 IS dbms_sql.DESC_TAB3;
24
25 -----------------------------------------------------------------------------
26 -- EXPRESSION RECord type
27 -----------------------------------------------------------------------------
28 -- This type is used for storing transformation expressions. Unlike VARCHAR2
29 -- it can be used for expression that can grow larger than 32K. Field lb is
30 -- the smallest index and ub is the largest index of the elements of the
31 -- VARCHAR2A array.
32 TYPE
33 EXPRESSION_REC IS RECORD (
34 lstmt dbms_sql.VARCHAR2A,
35 lb BINARY_INTEGER DEFAULT 1,
36 ub BINARY_INTEGER DEFAULT 0);
37
38 -----------------------------------------------------------------------------
39 -- TRAMSFORMation RECord type
40 -----------------------------------------------------------------------------
41 -- This type is used for storing in-memory transformation stack definition
42 -- for a single attribute.
43 -- Attribute specification field is used to specify additional information
44 -- and actions for an attribute:
45 -- NOPREP - disables data prep for an attribute.
46 -- This is applicable to both stack_* interfaces in this package
47 -- as well as dbms_data_mining.create_model. Stack_* methods do
48 -- not do any stacking for attributes with NOPREP specified, while
49 -- create_model does not do any auto data prep even when the
50 -- global PREP_AUTO setting is set to ON.
51 TYPE
52 TRANSFORM_REC IS RECORD (
53 attribute_name VARCHAR2(30),
54 attribute_subname VARCHAR2(4000),
55 expression EXPRESSION_REC,
56 reverse_expression EXPRESSION_REC,
57 attribute_spec VARCHAR2(4000));
58
59 -----------------------------------------------------------------------------
60 -- TRAMSFORMation LIST collection type
61 -----------------------------------------------------------------------------
62 -- This type is used for storing in-memory transformation stack definition.
63 TYPE
64 TRANSFORM_LIST IS TABLE OF TRANSFORM_REC;
65
66 -----------------------------------------------------------------------------
67 -- NESTed COLumn TYPEs
68 -----------------------------------------------------------------------------
69 nest_num_col_type CONSTANT NUMBER := 100001;
70 nest_cat_col_type CONSTANT NUMBER := 100002;
71 nest_bd_col_type CONSTANT NUMBER := 100003;
72 nest_bf_col_type CONSTANT NUMBER := 100004;
73
74 nest_nums_col_name CONSTANT VARCHAR2(30) := 'DM_NESTED_NUMERICALS';
75 nest_cats_col_name CONSTANT VARCHAR2(30) := 'DM_NESTED_CATEGORICALS';
76 nest_bds_col_name CONSTANT VARCHAR2(30) := 'DM_NESTED_BINARY_DOUBLES';
77 nest_bfs_col_name CONSTANT VARCHAR2(30) := 'DM_NESTED_BINARY_FLOATS';
78
79 -----------------------------------------------------------------------------
80 -- create_col_rem --
81 -----------------------------------------------------------------------------
82 -- NAME
83 -- create_col_rem - CREATE COLumn REMoval definition table
84 -- DESCRIPTION
85 -- Creates column removal definition table:
86 -- CREATE TABLE <col_rem>(
87 -- col VARCHAR2(30),
88 -- att VARCHAR2(4000))
89 -- This table is used to guide query generation process that removes
90 -- specified columns.
91 -- PARAMETERS
92 -- rem_table_name - column removal definition table
93 -- rem_schema_name - definition table schema name
94 -- RETURNS
95 -- None
96 -- EXCEPTIONS
97 -- None
98 -- NOTES
99 -- Note, that {col} is case sensitive since it generates quoted
100 -- identifiers. It is allowed to have multiple entries in the xform
101 -- defintion table for the same {col}.
102 -----------------------------------------------------------------------------
103 PROCEDURE create_col_rem(
104 rem_table_name VARCHAR2,
105 rem_schema_name VARCHAR2 DEFAULT NULL);
106
107 -----------------------------------------------------------------------------
108 -- xform_col_rem --
109 -----------------------------------------------------------------------------
110 -- NAME
111 -- xform_col_rem - XFORMation COLumn REMoval
112 -- DESCRIPTION
113 -- Creates a view that perfoms column removal from the data table. Only the
114 -- columns that are specified in the xform definition are removed, the
115 -- remaining columns do not change.
116 -- PARAMETERS
117 -- rem_table_name - xform definition table
118 -- data_table_name - data table
119 -- xform_view_name - xform view
120 -- rem_schema_name - xform definition table schema name
121 -- data_schema_name - data table schema name
122 -- xform_schema_name - xform view schema name
123 -- RETURNS
124 -- None
125 -- EXCEPTIONS
126 -- None
127 -- NOTES
128 -- None
129 -----------------------------------------------------------------------------
130 PROCEDURE xform_col_rem(
131 rem_table_name VARCHAR2,
132 data_table_name VARCHAR2,
133 xform_view_name VARCHAR2,
134 rem_schema_name VARCHAR2 DEFAULT NULL,
135 data_schema_name VARCHAR2 DEFAULT NULL,
136 xform_schema_name VARCHAR2 DEFAULT NULL);
137
138 -----------------------------------------------------------------------------
139 -- stack_col_rem --
140 -----------------------------------------------------------------------------
141 -- NAME
142 -- stack_col_rem - STACK COLumn REMoval
143 -- DESCRIPTION
144 -- For every column in the stack definition that has a matching entry in
145 -- the transformation definition, stacks column removal expression (NULL
146 -- expression) instead of the existing expression and updates the stack
147 -- definition. Columns in the stack definition that have no matching
148 -- entries in the transformation definition or have NULL expression are not
149 -- changed. Columns in the transformation definition that have no matching
150 -- entries in the stack definition are added to the stack definition (with
151 -- NULL expression). Reverse expressions in the stack definition are
152 -- updated accordingly, that is if an expression is updated, added or
153 -- remains unchanged then a corresponding reverse expression is also
154 -- updated, added or remains unchanged. When reverse expression is NULL it
155 -- is treated as "{attr}" (identity function) or "VALUE" for nested
156 -- columns. Given an entry ({attr}, {expr}, {rexp}) in the stack definition
157 -- and a matching entry in the transformation definition, after stacking
158 -- {expr} and {rexp} are both changed to NULL.
159 -- Example 1. Given transformation definition:
160 -- {col = COL1, att = NULL}
161 -- {col = COL2, att = NULL}
162 -- and stack definition:
163 -- {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
164 -- {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
165 -- the following updated stack definition is generated:
166 -- {attr = COL1, expr = NULL, rexp = NULL}
167 -- {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
168 -- {attr = COL2, expr = NULL, rexp = NULL}
169 -- PARAMETERS
170 -- rem_table_name - xform definition table
171 -- xform_list - stack definition
172 -- rem_schema_name - xform definition table schema name
173 -- RETURNS
174 -- xform_list - updated stack definition
175 -- EXCEPTIONS
176 -- None
177 -- NOTES
178 -- None
179 -----------------------------------------------------------------------------
180 PROCEDURE stack_col_rem(
181 rem_table_name VARCHAR2,
182 xform_list IN OUT NOCOPY TRANSFORM_LIST,
183 rem_schema_name VARCHAR2 DEFAULT NULL);
184
185 -----------------------------------------------------------------------------
186 -- create_norm_lin --
187 -----------------------------------------------------------------------------
188 -- NAME
189 -- create_norm_lin - CREATE LINear NORMalization definition table
190 -- DESCRIPTION
191 -- Creates linear normalization definition table:
192 -- CREATE TABLE <norm_lin>(
193 -- col VARCHAR2(30),
194 -- att VARCHAR2(4000),
195 -- shift NUMBER,
196 -- scale NUMBER)
197 -- This table is used to guide query generation process to construct
198 -- linear normalization expressions of the following form:
199 -- ("{col}" - {shift})/{scale} "{col}"
200 -- For example when col = 'my_col', shift = -1.5 and scale = 20 the
201 -- following expression is generated:
202 -- ("my_col" - (-1.5))/20 "my_col"
203 -- PARAMETERS
204 -- norm_table_name - linear normalization definition table
205 -- norm_schema_name - definition table schema name
206 -- RETURNS
207 -- None
208 -- EXCEPTIONS
209 -- None
210 -- NOTES
211 -- Note, that {col} is case sensitive since it generates quoted
212 -- identifiers. When there are multiple entries in the xform defintion
213 -- table for the same {col} the behavior is undefined. Any one of the
214 -- definitions may be used in query generation. NULL values remain
215 -- unchanged.
216 -----------------------------------------------------------------------------
217 PROCEDURE create_norm_lin(
218 norm_table_name VARCHAR2,
219 norm_schema_name VARCHAR2 DEFAULT NULL);
220
221 -----------------------------------------------------------------------------
222 -- insert_norm_lin_minmax --
223 -----------------------------------------------------------------------------
224 -- NAME
225 -- insert_norm_lin_minmax - INSERT into LINear NORMalization MINMAX
226 -- DESCRIPTION
227 -- For every NUMBER column in the data table that is not in the exclusion
228 -- list finds normalization definition and inserts it into the definition
229 -- table. Definition for each relevant column is computed based on the min
230 -- and max values that are computed from the data table:
231 -- shift = min
232 -- scale = max - min
233 -- The values of shift and scale are rounded to round_num significant
234 -- digits prior to storing them in the definition table.
235 -- PARAMETERS
236 -- norm_table_name - linear normalization definition table
237 -- data_table_name - data table
238 -- exclude_list - column exclusion list
239 -- round_num - number of significant digits
240 -- norm_schema_name - definition table schema name
241 -- data_schema_name - data table schema name
242 -- RETURNS
243 -- None
244 -- EXCEPTIONS
245 -- None
246 -- NOTES
247 -- Columns with all NULLs or only one unique value are ignored.
248 -----------------------------------------------------------------------------
249 PROCEDURE insert_norm_lin_minmax(
250 norm_table_name VARCHAR2,
251 data_table_name VARCHAR2,
252 exclude_list COLUMN_LIST DEFAULT NULL,
253 round_num PLS_INTEGER DEFAULT 6,
254 norm_schema_name VARCHAR2 DEFAULT NULL,
255 data_schema_name VARCHAR2 DEFAULT NULL);
256
257 -----------------------------------------------------------------------------
258 -- insert_norm_lin_scale --
259 -----------------------------------------------------------------------------
260 -- NAME
261 -- insert_norm_lin_scale - INSERT into LINear NORMalization SCALE
262 -- DESCRIPTION
263 -- For every NUMBER column in the data table that is not in the exclusion
264 -- list finds normalization definition and inserts it into the definition
265 -- table. Definition for each relevant column is computed based on the min
266 -- and max values that are computed from the data table:
267 -- shift = 0
268 -- scale = greatest(abs(max), abs(min))
269 -- The value of scale is rounded to round_num significant digits prior to
270 -- storing it in the definition table.
271 -- PARAMETERS
272 -- norm_table_name - linear normalization definition table
273 -- data_table_name - data table
274 -- exclude_list - column exclusion list
275 -- round_num - number of significant digits
276 -- norm_schema_name - definition table schema name
277 -- data_schema_name - data table schema name
278 -- RETURNS
279 -- None
280 -- EXCEPTIONS
281 -- None
282 -- NOTES
283 -- Columns with all NULLs or all zeros are ignored.
284 -----------------------------------------------------------------------------
285 PROCEDURE insert_norm_lin_scale(
286 norm_table_name VARCHAR2,
287 data_table_name VARCHAR2,
288 exclude_list COLUMN_LIST DEFAULT NULL,
289 round_num PLS_INTEGER DEFAULT 6,
290 norm_schema_name VARCHAR2 DEFAULT NULL,
291 data_schema_name VARCHAR2 DEFAULT NULL);
292
293 -----------------------------------------------------------------------------
294 -- insert_norm_lin_zscore --
295 -----------------------------------------------------------------------------
299 -- For every NUMBER column in the data table that is not in the exclusion
296 -- NAME
297 -- insert_norm_lin_zscore - INSERT into LINear NORMalization Z-SCORE
298 -- DESCRIPTION
300 -- list finds normalization definition and inserts it into the definition
301 -- table. Definition for each relevant column is computed based on the
302 -- mean and standard deviation that are estimated from the data table:
303 -- shift = mean
304 -- scale = stddev
305 -- The values of shift and scale are rounded to round_num significant
306 -- digits prior to storing them in the definition table.
307 -- PARAMETERS
308 -- norm_table_name - linear normalization definition table
309 -- data_table_name - data table
310 -- exclude_list - column exclusion list
311 -- round_num - number of significant digits
312 -- norm_schema_name - definition table schema name
313 -- data_schema_name - data table schema name
314 -- RETURNS
315 -- None
316 -- EXCEPTIONS
317 -- None
318 -- NOTES
319 -- Columns with all NULLs or only one unique value are ignored.
320 -----------------------------------------------------------------------------
321 PROCEDURE insert_norm_lin_zscore(
322 norm_table_name VARCHAR2,
323 data_table_name VARCHAR2,
324 exclude_list COLUMN_LIST DEFAULT NULL,
325 round_num PLS_INTEGER DEFAULT 6,
326 norm_schema_name VARCHAR2 DEFAULT NULL,
327 data_schema_name VARCHAR2 DEFAULT NULL);
328
329 -----------------------------------------------------------------------------
330 -- xform_norm_lin --
331 -----------------------------------------------------------------------------
332 -- NAME
333 -- xform_norm_lin - XFORMation LINear NORMalization
334 -- DESCRIPTION
335 -- Creates a view that perfoms linear normalization of the data table
336 -- Only the columns that are specified in the xform definition are
337 -- normalized, the remaining columns do not change.
338 -- PARAMETERS
339 -- norm_table_name - xform definition table
340 -- data_table_name - data table
341 -- xform_view_name - xform view
342 -- norm_schema_name - xform definition table schema name
343 -- data_schema_name - data table schema name
344 -- xform_schema_name - xform view schema name
345 -- RETURNS
346 -- None
347 -- EXCEPTIONS
348 -- None
349 -- NOTES
350 -- None
351 -----------------------------------------------------------------------------
352 PROCEDURE xform_norm_lin(
353 norm_table_name VARCHAR2,
354 data_table_name VARCHAR2,
355 xform_view_name VARCHAR2,
356 norm_schema_name VARCHAR2 DEFAULT NULL,
357 data_schema_name VARCHAR2 DEFAULT NULL,
358 xform_schema_name VARCHAR2 DEFAULT NULL);
359
360 -----------------------------------------------------------------------------
361 -- stack_norm_lin --
362 -----------------------------------------------------------------------------
363 -- NAME
364 -- stack_norm_lin - STACK LINear NORMalization
365 -- DESCRIPTION
366 -- For every column in the stack definition that has a matching entry in
367 -- the transformation definition, stacks linear normalization expression on
368 -- top of the existing expression and updates the stack definition.
369 -- Columns in the stack definition that have no matching entries in the
370 -- transformation definition or have NULL expression are not changed.
371 -- Columns in the transformation definition that have no matching entries
372 -- in the stack definition are added to the stack definition (using "{col}"
373 -- in place of the original expression or "VALUE" for nested attributes).
374 -- Reverse expressions in the stack definition are updated accordingly,
375 -- that is if an expression is updated, added or remains unchanged then a
376 -- corresponding reverse expression is also updated, added or remains
377 -- unchanged. When reverse expression is NULL it is treated as identity
378 -- function ("{attr}" or "VALUE" for nested attributes).
379 -- Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
380 -- matching entry in the transformation definition, after stacking {expr}
381 -- has the following form:
382 -- ({expr} - {shift})/{scale}
383 -- and {rexp} maintains the following form with every occurance of {attr}
384 -- replaced with:
385 -- {attr}*{scale} + {shift}
386 -- Example 1. Given transformation definition:
387 -- {col = COL1, shift = -1.5, scale = 20}
388 -- {col = COL2, shift = 0, scale = 10}
389 -- and stack definition:
390 -- {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
391 -- {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
392 -- the following updated stack definition is generated:
393 -- {attr = COL1,
394 -- expr = (log(10, COL1) - (-1.5)) / 20,
395 -- rexp = power(10, COL1*20 + (-1.5))}
396 -- {attr = COL3,
397 -- expr = ln(COL3),
398 -- rexp = exp(COL3)}
399 -- {attr = COL2,
400 -- expr = (COL2 - 0) / 10,
401 -- rexp = COL2*10 + 0}
402 -- PARAMETERS
403 -- norm_table_name - xform definition table
407 -- xform_list - updated stack definition
404 -- xform_list - stack definition
405 -- norm_schema_name - xform definition table schema name
406 -- RETURNS
408 -- EXCEPTIONS
409 -- None
410 -- NOTES
411 -- None
412 -----------------------------------------------------------------------------
413 PROCEDURE stack_norm_lin(
414 norm_table_name VARCHAR2,
415 xform_list IN OUT NOCOPY TRANSFORM_LIST,
416 norm_schema_name VARCHAR2 DEFAULT NULL);
417
418 -----------------------------------------------------------------------------
419 -- create_bin_num --
420 -----------------------------------------------------------------------------
421 -- NAME
422 -- create_bin_num - CREATE NUMerical BINning definition table
423 -- DESCRIPTION
424 -- Creates numerical binning definition table:
425 -- CREATE TABLE <bin_num>(
426 -- col VARCHAR2(30),
427 -- att VARCHAR2(4000),
428 -- val NUMBER,
429 -- bin VARCHAR2(4000))
430 -- This table is used to guide query generation process to construct
431 -- numerical binning expressions of the following form:
432 -- CASE WHEN "{col}" < {val0} THEN '{bin0}'
433 -- WHEN "{col}" <= {val1} THEN '{bin1}'
434 -- ...
435 -- WHEN "{col}" <= {valN} THEN '{binN}'
436 -- WHEN "{col}" IS NOT NULL THEN '{bin(N+1)}'
437 -- END "{col}"
438 -- This expression maps values in the range [{val0};{valN}] into N bins
439 -- {bin1}, ..., {binN}, values outside of this range into {bin0} or
440 -- {bin(N+1)}, such that
441 -- (-inf; {val0}) -> {bin0}
442 -- [{val0}; {val1}] -> {bin1}
443 -- ...
444 -- ({val(N-1)}; {valN}] -> {binN}
445 -- ({valN}; +inf) -> {bin(N+1)}.
446 -- NULL values remain unchanged. {bin(N+1)} is optional. If it is not
447 -- specified the values ("{col}" > {valN}) are mapped to NULL. To specify
448 -- {bin(N+1)} provide a row with {val} = NULL. The order of the WHEN ..
449 -- THEN pairs is based on the ascending order of {val} for a given {col}.
450 -- Example 1. <bin_num> contains four rows with {col} = 'mycol':
451 -- {col = 'mycol', val = 15.5, bin = 'small'}
452 -- {col = 'mycol', val = 10, bin = 'tiny'}
453 -- {col = 'mycol', val = 20, bin = 'large'}
454 -- {col = 'mycol', val = NULL, bin = 'huge'}
455 -- the following expression is generated:
456 -- CASE WHEN "mycol" < 10 THEN 'tiny'
457 -- WHEN "mycol" <= 15.5 THEN 'small'
458 -- WHEN "mycol" <= 20 THEN 'large'
459 -- WHEN "mycol" IS NOT NULL THEN 'huge'
460 -- END "mycol"
461 -- Example 2. <bin_num> contains three rows with {col} = 'mycol':
462 -- {col = 'mycol', val = 15.5, bin = NULL}
463 -- {col = 'mycol', val = 10, bin = 'tiny'}
464 -- {col = 'mycol', val = 20, bin = 'large'}
465 -- the following expression is generated:
466 -- CASE WHEN "mycol" < 10 THEN NULL
467 -- WHEN "mycol" <= 15.5 THEN 'small'
468 -- WHEN "mycol" <= 20 THEN 'large'
469 -- END "mycol"
470 -- PARAMETERS
471 -- bin_table_name - numerical binning definition table
472 -- bin_schema_name - definition table schema name
473 -- RETURNS
474 -- None
475 -- EXCEPTIONS
476 -- None
477 -- NOTES
478 -- Note, that {col} is case sensitive since it generates quoted
479 -- identifiers. In cases when there are multiple entries with the same
480 -- {col}, {val} combiniation with different {bin} the behavior is
481 -- undefined. Any one of the {bin} might be used. The maximum number of
482 -- arguments in a CASE expression is 255, and each WHEN ... THEN pair
483 -- counts as two arguments.
484 -----------------------------------------------------------------------------
485 PROCEDURE create_bin_num(
486 bin_table_name VARCHAR2,
487 bin_schema_name VARCHAR2 DEFAULT NULL);
488
489 -----------------------------------------------------------------------------
490 -- insert_bin_num_eqwidth --
491 -----------------------------------------------------------------------------
492 -- NAME
493 -- insert_bin_num_eqwidth - INSERT into NUMerical BINning EQual-WIDTH
494 -- DESCRIPTION
495 -- For every NUMBER column in the data table that is not in the exclusion
496 -- list finds numerical binning definition and inserts it into the
497 -- definition table. Definition for each relevant column is computed based
498 -- on the min and max values that are computed from the data table. Each
499 -- of the N (p_bin_num) bins {bin1}, ..., {binN} span ranges of equal
500 -- width
501 -- inc = (max - min) / N
502 -- where {binI} = I when N > 0 or {binI} = N+1-I when N < 0, and
503 -- {bin0} = {bin(N+1)} = NULL. For example, when N=2, col='mycol', min=10,
504 -- and max = 21, the following three rows are inserted into the
505 -- definition table (inc = 5.5):
506 -- COL VAL BIN
507 -- ----- ----- -----
508 -- mycol 10 NULL
509 -- mycol 15.5 1
510 -- mycol 21 2
511 -- The values of {val} are rounded to round_num significant digits prior
515 -- data_table_name - data table
512 -- to storing them in the definition table.
513 -- PARAMETERS
514 -- bin_table_name - numerical binning definition table
516 -- bin_num - number of bins
517 -- exclude_list - column exclusion list
518 -- round_num - number of significant digits
519 -- bin_schema_name - definition table schema name
520 -- data_schema_name - data table schema name
521 -- RETURNS
522 -- None
523 -- EXCEPTIONS
524 -- None
525 -- NOTES
526 -- Columns with all NULLs or only one unique value are ignored. Nothing
527 -- is done when bin_num IS NULL or bin_num = 0.
528 -----------------------------------------------------------------------------
529 PROCEDURE insert_bin_num_eqwidth(
530 bin_table_name VARCHAR2,
531 data_table_name VARCHAR2,
532 bin_num PLS_INTEGER DEFAULT 10,
533 exclude_list COLUMN_LIST DEFAULT NULL,
534 round_num PLS_INTEGER DEFAULT 6,
535 bin_schema_name VARCHAR2 DEFAULT NULL,
536 data_schema_name VARCHAR2 DEFAULT NULL);
537
538 -----------------------------------------------------------------------------
539 -- insert_bin_num_qtile --
540 -----------------------------------------------------------------------------
541 -- NAME
542 -- insert_bin_num_qtile - INSERT into NUMerical BINning QuanTILE
543 -- DESCRIPTION
544 -- For every NUMBER column in the data table that is not in the exclusion
545 -- list finds numerical binning definition and inserts it into the
546 -- definition table. Definition for each relevant column is computed based
547 -- on the min values per quantile, where quantiles are computed from the
548 -- data using NTILE function. Bin {bin1} spans range [min(1), min(2)],
549 -- bins {bin2}, ..., {bin(N-1)} span ranges (min(I), min(I+1)] and {binN}
550 -- range (min(N), max(N)] with {binI} = I when N > 0 or {binI} = N+1-I
551 -- when N < 0, and {bin0}={bin(N+1)} = NULL. Bins with equal left and
552 -- right boundaries are collapsed. For example, when N=4, col='mycol',
553 -- and data is {1,2,2,2,2,3,4}, the following three rows are inserted into
554 -- the definition table:
555 -- COL VAL BIN
556 -- ----- ----- -----
557 -- mycol 1 NULL
558 -- mycol 2 1
559 -- mycol 4 2
560 -- Here quantiles are {1,2}, {2,2}, {2,3}, {4} and min(1) = 1, min(2) = 2,
561 -- min(3) = 2, min(4) = 4, max(4) = 4, and ranges are [1,2], (2,2], (2,4],
562 -- (4,4]. After collapsing [1,2], (2,4].
563 -- PARAMETERS
564 -- bin_table_name - numerical binning definition table
565 -- data_table_name - data table
566 -- bin_num - number of bins
567 -- exclude_list - column exclusion list
568 -- bin_schema_name - definition table schema name
569 -- data_schema_name - data table schema name
570 -- RETURNS
571 -- None
572 -- EXCEPTIONS
573 -- None
574 -- NOTES
575 -- Columns with all NULLs or only one unique value are ignored. Nothing
576 -- is done when bin_num IS NULL or bin_num = 0.
577 -----------------------------------------------------------------------------
578 PROCEDURE insert_bin_num_qtile(
579 bin_table_name VARCHAR2,
580 data_table_name VARCHAR2,
581 bin_num PLS_INTEGER DEFAULT 10,
582 exclude_list COLUMN_LIST DEFAULT NULL,
583 bin_schema_name VARCHAR2 DEFAULT NULL,
584 data_schema_name VARCHAR2 DEFAULT NULL);
585
586 -----------------------------------------------------------------------------
587 -- insert_autobin_num_eqwidth --
588 -----------------------------------------------------------------------------
589 -- NAME
590 -- insert_autobin_num_eqwidth - INSERT into NUMerical BINning AUTOmated
591 -- EQual-WIDTH
592 -- DESCRIPTION
593 -- For every NUMBER column in the data table that is not in the exclusion
594 -- list finds numerical binning definition and inserts it into the
595 -- definition table. Definition for each relevant column is computed using
596 -- equal-width method (see description for insert_bin_nume_eqwidth). The
597 -- number of bins (N) is computed for each column separately and is based
598 -- on the number of non-NULL values (cnt), min and max values, and the
599 -- standard deviation (dev)
600 -- N = floor(power(cnt, 1/3)*(max - min)/(C*dev))
601 -- where C = 3.49/0.9. Parameter bin_num is used to adjust N to be at
602 -- least bin_num. No adjustment is done when bin_num is NULL or zero.
603 -- Parameter max_bin_num is used to adjust N to be at most max_bin_num.
604 -- No adjustment is done when max_bin_num is NULL or zero. For columns
605 -- with all integer values (discrete columns) N is adjusted to be at most
606 -- the maximum number of distinct values in the obseved range
607 -- max - min + 1
608 -- Parameter sample_size is used to adjust cnt to be at most sample_size.
609 -- No adjustment is done when sample_size is NULL or zero.
610 -- PARAMETERS
611 -- bin_table_name - numerical binning definition table
615 -- exclude_list - column exclusion list
612 -- data_table_name - data table
613 -- bin_num - minimum number of bins
614 -- max_bin_num - maximum number of bins
616 -- round_num - number of significant digits
617 -- sample_size - maximum size of the sample
618 -- bin_schema_name - definition table schema name
619 -- data_schema_name - data table schema name
620 -- rem_table_name - column removal definition table
621 -- rem_schema_name - removal definition table schema
622 -- RETURNS
623 -- None
624 -- EXCEPTIONS
625 -- None
626 -- NOTES
627 -- Columns with all NULLs or only one unique value are ignored. The sign
628 -- of bin_num, max_bin_num, sample_size has no effect on the result, only
629 -- the absolute values are being used. The value adjustment for N is done
630 -- in the following order: first, bin_num, then max_bin_num, and then
631 -- discrete column adjustment.
632 -- Column removal definition table is optional. If specified columns with
633 -- all NULLs or only one unique value will be inserted into the column
634 -- removal definition table.
635 -----------------------------------------------------------------------------
636 PROCEDURE insert_autobin_num_eqwidth(
637 bin_table_name VARCHAR2,
638 data_table_name VARCHAR2,
639 bin_num PLS_INTEGER DEFAULT 3,
640 max_bin_num PLS_INTEGER DEFAULT 100,
641 exclude_list COLUMN_LIST DEFAULT NULL,
642 round_num PLS_INTEGER DEFAULT 6,
643 sample_size PLS_INTEGER DEFAULT 50000,
644 bin_schema_name VARCHAR2 DEFAULT NULL,
645 data_schema_name VARCHAR2 DEFAULT NULL,
646 rem_table_name VARCHAR2 DEFAULT NULL,
647 rem_schema_name VARCHAR2 DEFAULT NULL);
648
649 -----------------------------------------------------------------------------
650 -- xform_bin_num --
651 -----------------------------------------------------------------------------
652 -- NAME
653 -- xform_bin_num - XFORMation NUMerical BINning
654 -- DESCRIPTION
655 -- Creates a view that perfoms numerical binning of the data table. Only
656 -- the columns that are specified in the xform definition are binned, the
657 -- remaining columns do not change.
658 -- PARAMETERS
659 -- bin_table_name - xform definition table
660 -- data_table_name - data table
661 -- xform_view_name - xform view
662 -- literal_flag - literal flag
663 -- bin_schema_name - xform definition table schema name
664 -- data_schema_name - data table schema name
665 -- xform_schema_name - xform view schema name
666 -- RETURNS
667 -- None
668 -- EXCEPTIONS
669 -- None
670 -- NOTES
671 -- Literal flag indicates whether the values in {bin} are valid SQL
672 -- literals. When the the flag is set to TRUE the value of {bin} is used
673 -- as is in query generation, otherwise it is converted into a valid text
674 -- literal (surround by quotes and double the quotes inside). By default
675 -- the flag is set to FALSE. One example when it can be set to TRUE is in
676 -- cases when all {bin} are numbers. In that case the xformed column will
677 -- remain numeric as opposed to textual (default behavior). For example,
678 -- for the following xfrom definition:
679 -- COL VAL BIN
680 -- ----- ----- -----
681 -- mycol 10 NULL
682 -- mycol 15.5 1
683 -- mycol 21 2
684 -- the following expression is generated when the flag is set to FALSE:
685 -- CASE WHEN "mycol" < 10 THEN NULL
686 -- WHEN "mycol" <= 15.5 THEN '1'
687 -- WHEN "mycol" <= 20 THEN '2'
688 -- END "mycol"
689 -- and when the flag is set to TRUE:
690 -- CASE WHEN "mycol" < 10 THEN NULL
691 -- WHEN "mycol" <= 15.5 THEN 1
692 -- WHEN "mycol" <= 20 THEN 2
693 -- END "mycol"
694 -----------------------------------------------------------------------------
695 PROCEDURE xform_bin_num(
696 bin_table_name VARCHAR2,
697 data_table_name VARCHAR2,
698 xform_view_name VARCHAR2,
699 literal_flag BOOLEAN DEFAULT FALSE,
700 bin_schema_name VARCHAR2 DEFAULT NULL,
701 data_schema_name VARCHAR2 DEFAULT NULL,
702 xform_schema_name VARCHAR2 DEFAULT NULL);
703
704 -----------------------------------------------------------------------------
705 -- stack_bin_num --
706 -----------------------------------------------------------------------------
707 -- NAME
708 -- stack_bin_num - STACK NUMerical BINning
709 -- DESCRIPTION
710 -- For every column in the stack definition that has a matching entry in
711 -- the transformation definition, stacks numerical binning expression on
712 -- top of the existing expression and updates the stack definition.
713 -- Columns in the stack definition that have no matching entries in the
717 -- in place of the original expression or "VALUE" for nested attributes).
714 -- transformation definition or have NULL expression are not changed.
715 -- Columns in the transformation definition that have no matching entries
716 -- in the stack definition are added to the stack definition (using "{col}"
718 -- Reverse expressions in the stack definition are updated accordingly,
719 -- that is if an expression is updated, added or remains unchanged then a
720 -- corresponding reverse expression is also updated, added or remains
721 -- unchanged. When reverse expression is NULL it is treated as identity
722 -- function ("{attr}" or "VALUE" for nested attributes).
723 -- Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
724 -- matching entry in the transformation definition, after stacking {expr}
725 -- has the following form:
726 -- CASE WHEN ({expr}) < {val0} THEN '{bin0}'
727 -- WHEN ({expr}) <= {val1} THEN '{bin1}'
728 -- ...
729 -- WHEN ({expr}) <= {valN} THEN '{binN}'
730 -- WHEN ({expr}) IS NOT NULL THEN '{bin(N+1)}'
731 -- END
732 -- and {rexp} has the following form:
733 -- DECODE("{attr}", '{bin0}', '( ; {rev0})',
734 -- '{bin1}', '[{rev0}; {rev1})',
735 -- ...
736 -- '{binN}', '[{rev(N-1)}; {revN}]',
737 -- '{bin(N+1)}', '({revN}; )',
738 -- NULL, 'NULL')
739 -- where {revI} is the result of applying reverese expression to {valI}.
740 -- If {binI} and {binJ} are equal then the corresponding entries of the
741 -- DECODE function above are merged into:
742 -- '{binI}', '[{rev(I-1)}; {revI}), [{rev(J-1)}; {revJ})'
743 -- Note that reverse expressions implicitly maps invalid bins to NULL.
744 -- Example 1. Given transformation definition:
745 -- {col = COL1, val = 0, bin = NULL}
746 -- {col = COL1, val = 1, bin = A}
747 -- {col = COL1, val = 2, bin = B}
748 -- {col = COL1, val = 3, bin = A}
749 -- {col = COL2, val = 10, bin = NULL}
750 -- {col = COL2, val = 15, bin = 1}
751 -- {col = COL2, val = 20, bin = 2}
752 -- and stack definition:
753 -- {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
754 -- {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
755 -- the following updated stack definition is generated:
756 -- {attr = COL1,
757 -- expr = CASE WHEN (log(10, COL1)) < 0 THEN NULL
758 -- WHEN (log(10, COL1)) <= 1 THEN 'A'
759 -- WHEN (log(10, COL1)) <= 2 THEN 'B'
760 -- WHEN (log(10, COL1)) <= 3 THEN 'A' END,
761 -- rexp = DECODE("COL1", 'A', '[1; 10), [100; 1000]',
762 -- 'B', '[10; 100)',
763 -- NULL, '( ; 1), (1000; ), NULL')}
764 -- {attr = COL3,
765 -- expr = ln(COL3),
766 -- rexp = exp(COL3)}
767 -- {attr = COL2,
768 -- expr = CASE WHEN "COL2" < 10 THEN NULL
769 -- WHEN "COL2" <= 15 THEN '1'
770 -- WHEN "COL2" <= 20 THEN '2' END
771 -- rexp = DECODE("COL2", '1', '[10; 15)',
772 -- '2', '[15; 20]',
773 -- NULL, '( ; 10) OR (20; ) OR NULL')}
774 -- PARAMETERS
775 -- bin_table_name - xform definition table
776 -- xform_list - stack definition
777 -- literal_flag - literal flag
778 -- bin_schema_name - xform definition table schema name
779 -- RETURNS
780 -- xform_list - updated stack definition
781 -- EXCEPTIONS
782 -- None
783 -- NOTES
784 -- Literal flag indicates whether the values in {bin} are valid SQL
785 -- literals. When the the flag is set to TRUE the value of {bin} is used
786 -- "as is" in the expression generation, otherwise it is converted into a
787 -- valid text literal (surround by quotes and double the quotes inside). By
788 -- default the flag is set to FALSE. One example when it can be set to TRUE
789 -- is in cases when all {bin} are numbers. In that case the xformed column
790 -- will remain numeric as opposed to textual (default behavior). For
791 -- example, for the following xfrom definition:
792 -- COL VAL BIN
793 -- ----- ----- -----
794 -- mycol 10 NULL
795 -- mycol 15 1
796 -- mycol 20 2
797 -- the following {expr} and {rexp} are generated when the flag is FALSE:
798 -- expr = CASE WHEN "mycol" < 10 THEN NULL
799 -- WHEN "mycol" <= 15 THEN '1'
800 -- WHEN "mycol" <= 20 THEN '2' END
801 -- rexp = DECODE("mycol", '1', '[10; 15)',
802 -- '2', '[15; 20]'
803 -- NULL, '( ; 10) OR (20; ) OR NULL')
804 -- and when the flag is set to TRUE:
805 -- expr = CASE WHEN "mycol" < 10 THEN NULL
806 -- WHEN "mycol" <= 15 THEN 1
807 -- WHEN "mycol" <= 20 THEN 2 END
808 -- rexp = DECODE("mycol", 1, '[10; 15)',
809 -- 2, '[15; 20]',
810 -- NULL, '( ; 10) OR (20; ) OR NULL')
811 -----------------------------------------------------------------------------
812 PROCEDURE stack_bin_num(
813 bin_table_name VARCHAR2,
817
814 xform_list IN OUT NOCOPY TRANSFORM_LIST,
815 literal_flag BOOLEAN DEFAULT FALSE,
816 bin_schema_name VARCHAR2 DEFAULT NULL);
818 -----------------------------------------------------------------------------
819 -- create_bin_cat --
820 -----------------------------------------------------------------------------
821 -- NAME
822 -- create_bin_cat - CREATE CATegorical BINning definition table
823 -- DESCRIPTION
824 -- Creates categorical binning definition table:
825 -- CREATE TABLE <bin_cat>(
826 -- col VARCHAR2(30),
827 -- att VARCHAR2(4000),
828 -- val VARCHAR2(4000),
829 -- bin VARCHAR2(4000))
830 -- This table is used to guide query generation process to construct
831 -- categorical binning expressions of the following form:
832 -- DECODE("{col}", {val1}, {bin1},
833 -- ...
834 -- {valN}, {binN},
835 -- NULL, NULL,
836 -- {bin(N+1)}) "{col}"
837 -- This expression maps values {val1}, ..., {valN} into N bins {bin1},...,
838 -- {binN}, and other values into {bin(N+1)}, while NULL values remain
839 -- unchanged. {bin(N+1)} is optional. If it is not specified it defaults
840 -- to NULL. To specify {bin(N+1)} provide a row with {val} = NULL.
841 -- Example 1. <bin_cat> contains four rows with {col} = 'mycol':
842 -- {col = 'mycol', val = 'Waltham', bin = 'MA'}
843 -- {col = 'mycol', val = 'Burlington', bin = 'MA'}
844 -- {col = 'mycol', val = 'Redwood Shores', bin = 'CA'}
845 -- {col = 'mycol', val = NULL, bin = 'OTHER'}
846 -- the following expression is generated:
847 -- DECODE("mycol", 'Waltham', 'MA',
848 -- 'Burlington', 'MA',
849 -- 'Redwood Shores', 'CA',
850 -- NULL, NULL,
851 -- 'OTHER') "mycol"
852 -- Example 2. <bin_cat> contains three rows with {col} = 'mycol':
853 -- {col = 'mycol', val = 'Waltham', bin = 'MA'}
854 -- {col = 'mycol', val = 'Burlington', bin = 'MA'}
855 -- {col = 'mycol', val = 'Redwood Shores', bin = 'CA'}
856 -- the following expression is generated:
857 -- DECODE("mycol", 'Waltham', 'MA',
858 -- 'Burlington', 'MA',
859 -- 'Redwood Shores', 'CA') "mycol"
860 -- PARAMETERS
861 -- bin_table_name - categorical binning definition table
862 -- bin_schema_name - definition table schema name
863 -- RETURNS
864 -- None
865 -- EXCEPTIONS
866 -- None
867 -- NOTES
868 -- Note, that {col} is case sensitive since it generates quoted
869 -- identifiers. In cases when there are multiple entries with the same
870 -- {col}, {val} combiniation with different {bin} the behavior is
871 -- undefined. Any one of the {bin} might be used. The maximum number of
872 -- arguments of a DECODE function is 255.
873 -----------------------------------------------------------------------------
874 PROCEDURE create_bin_cat(
875 bin_table_name VARCHAR2,
876 bin_schema_name VARCHAR2 DEFAULT NULL);
877
878 -----------------------------------------------------------------------------
879 -- insert_bin_cat_freq --
880 -----------------------------------------------------------------------------
881 -- NAME
882 -- insert_bin_cat_freq - INSERT into CATegorical BINning top-FREQuency
883 -- DESCRIPTION
884 -- For every VARCHAR2, CHAR column in the data table that is not in the
885 -- exclusion list finds categorical binning definition and inserts it into
886 -- the definition table. Definition for each relevant column is computed
887 -- based on occurence frequency of column values that are computed from
888 -- the data table reference. Each of the N (bin_num) bins {bin1}, ...,
889 -- {binN} correspond to the values with top frequencies when N > 0 or
890 -- bottom frequencies when N < 0, and {bin(N+1)} to all remaining
891 -- values, where {binI} = I. Ordering ties among identical frequencies are
892 -- broken by ordering on column values (ASC for N > 0 or DESC for N < 0).
893 -- When the the number of distinct values C < N only C+1 bins will be
894 -- created. Parameter default_num (D) is used for prunning based on the
895 -- number of values that fall in the default bin. When D > 0 only columns
896 -- that have at least D defaults are kept while others are ignored. When
897 -- D < 0 only columns that have at most D values are kept. No prunning is
898 -- done when D is NULL or when D = 0. Parameter bin_support (SUP) is used
899 -- for restricting bins to frequent (SUP > 0) values frq >= SUP*tot, or
900 -- infrequent (SUP < 0) ones frq <= (-SUP)*tot, where frq is a given value
901 -- count and tot is a sum of all counts as computed from the data. No
902 -- support filtering is done when SUP is NULL or when SUP = 0.
903 -- PARAMETERS
904 -- bin_table_name - categorical binning definition table
905 -- data_table_name - data table
906 -- bin_num - number of bins
907 -- exclude_list - column exclusion list
908 -- default_num - number of default values
909 -- bin_support - bin support (fraction)
913 -- None
910 -- bin_schema_name - definition table schema name
911 -- data_schema_name - data table schema name
912 -- RETURNS
914 -- EXCEPTIONS
915 -- None
916 -- NOTES
917 -- Nothing is done when bin_num IS NULL or bin_num = 0. NULL values
918 -- are not counted. Columns with all NULLs are ignored.
919 -----------------------------------------------------------------------------
920 PROCEDURE insert_bin_cat_freq(
921 bin_table_name VARCHAR2,
922 data_table_name VARCHAR2,
923 bin_num PLS_INTEGER DEFAULT 9,
924 exclude_list COLUMN_LIST DEFAULT NULL,
925 default_num PLS_INTEGER DEFAULT 2,
926 bin_support NUMBER DEFAULT NULL,
927 bin_schema_name VARCHAR2 DEFAULT NULL,
928 data_schema_name VARCHAR2 DEFAULT NULL);
929
930 -----------------------------------------------------------------------------
931 -- xform_bin_cat --
932 -----------------------------------------------------------------------------
933 -- NAME
934 -- xform_bin_cat - XFORMation CATegorical BINning
935 -- DESCRIPTION
936 -- Creates a view that perfoms categorical binning of the data table. Only
937 -- the columns that are specified in the xform definition are binned, the
938 -- remaining columns do not change.
939 -- PARAMETERS
940 -- bin_table_name - xform definition table
941 -- data_table_name - data table
942 -- xform_view_name - xform view
943 -- litiral_flag - literal flag
944 -- bin_schema_name - xform definition table schema name
945 -- data_schema_name - data table schema name
946 -- xform_schema_name - xform view schema name
947 -- RETURNS
948 -- None
949 -- EXCEPTIONS
950 -- None
951 -- NOTES
952 -- Literal flag indicates whether the values in {bin} are valid SQL
953 -- literals. When the the flag is set to TRUE the value of {bin} is used
954 -- as is in query generation, otherwise it is converted into a valid text
955 -- literal (surround by quotes and double the quotes inside). By default
956 -- the flag is set to FALSE. One example when it can be set to TRUE is in
957 -- cases when all {bin} are numbers. In that case the xformed column will
958 -- be numeric as opposed to textual (default behavior). For example,
959 -- for the following xfrom definition:
960 -- COL VAL BIN
961 -- ----- -------------- ----
962 -- mycol Waltham 1
963 -- mycol Burlington 1
964 -- mycol Redwood Shores 2
965 -- the following expression is generated when the flag is set to FALSE:
966 -- DECODE("mycol", 'Waltham', '1',
967 -- 'Burlington', '1',
968 -- 'Redwood Shores','2') "mycol"
969 -- and when the flag is set to TRUE:
970 -- DECODE("mycol", 'Waltham', 1,
971 -- 'Burlington', 1,
972 -- 'Redwood Shores', 2) "mycol"
973 -----------------------------------------------------------------------------
974 PROCEDURE xform_bin_cat(
975 bin_table_name VARCHAR2,
976 data_table_name VARCHAR2,
977 xform_view_name VARCHAR2,
978 literal_flag BOOLEAN DEFAULT FALSE,
979 bin_schema_name VARCHAR2 DEFAULT NULL,
980 data_schema_name VARCHAR2 DEFAULT NULL,
981 xform_schema_name VARCHAR2 DEFAULT NULL);
982
983 -----------------------------------------------------------------------------
984 -- stack_bin_cat --
985 -----------------------------------------------------------------------------
986 -- NAME
987 -- stack_bin_cat - STACK CATegorical BINning
988 -- DESCRIPTION
989 -- For every column in the stack definition that has a matching entry in
990 -- the transformation definition, stacks categorical binning expression on
991 -- top of the existing expression and updates the stack definition.
992 -- Columns in the transformation definition that have no matching entries
993 -- in the stack definition are added to the stack definition (using "{col}"
994 -- in place of the original expression or "VALUE" for nested attributes).
995 -- Reverse expressions in the stack definition are updated accordingly,
996 -- that is if an expression is updated, added or remains unchanged then a
997 -- corresponding reverse expression is also updated, added or remains
998 -- unchanged. When reverse expression is NULL it is treated as identity
999 -- function ("{attr}" or "VALUE" for nested attributes).
1000 -- Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
1001 -- matching entry in the transformation definition, after stacking {expr}
1002 -- has the following form:
1003 -- DECODE({expr}, '{val1}', '{bin1}',
1004 -- ...
1005 -- '{valN}', '{binN}',
1006 -- NULL, NULL,
1007 -- '{bin(N+1)}')
1011 -- '{binN}', '{revN}',
1008 -- and {rexp} has the following form:
1009 -- DECODE("{attr}", '{bin1}', '{rev1}',
1010 -- ...
1012 -- NULL, 'NULL',
1013 -- '{bin(N+1)}', 'DEFAULT')
1014 -- where {revI} is the result of applying reverese expression to {valI}.
1015 -- If {binI} and {binJ} are equal then the corresponding entries of the
1016 -- DECODE function above are merged into:
1017 -- '{binI}', '{revI}, {revJ}'
1018 -- If more than one entry maps to the default bin {bin(N+1)} they are all
1019 -- merged to
1020 -- '{bin(N+1)}', 'DEFAULT'
1021 -- Note that reverse expression implicitly maps invalid bins to NULL.
1022 -- Example 1. Given transformation definition:
1023 -- {col = COL1, val = waltham, bin = MA}
1024 -- {col = COL1, val = burlington, bin = MA}
1025 -- {col = COL1, val = redwood shores, bin = CA}
1026 -- {col = COL2, val = MA, bin = East}
1027 -- {col = COL2, val = CA, bin = West}
1028 -- {col = COL2, val = NULL, bin = USA}
1029 -- and stack definition:
1030 -- {attr = COL1, expr = lower(COL1), rexp = initcap(COL1)}
1031 -- {attr = COL3, expr = upper(COL3), rexp = initcap(COL3)}
1032 -- the following updated stack definition is generated:
1033 -- {attr = COL1,
1034 -- expr = DECODE(lower(COL1), 'waltham', 'MA',
1035 -- 'burlington', 'MA',
1036 -- 'redwood shores', 'CA'),
1037 -- rexp = DECODE("COL1", 'MA', '''Waltham'', ''Burlington''',
1038 -- 'CA', '''Redwood Shores''',
1039 -- NULL, 'DEFAULT')}
1040 -- {attr = COL3,
1041 -- expr = upper(COL3),
1042 -- rexp = initcap(COL3)}
1043 -- {attr = COL2,
1044 -- expr = DECODE("COL2", 'MA', 'East',
1045 -- 'NY', 'East',
1046 -- 'CA', 'West',
1047 -- NULL, NULL,
1048 -- 'USA')
1049 -- rexp = DECODE("COL2", 'East', '''MA''',
1050 -- 'West', '''CA''',
1051 -- NULL, 'NULL',
1052 -- 'USA', 'DEFAULT')}
1053 -- PARAMETERS
1054 -- bin_table_name - xform definition table
1055 -- xform_list - stack definition
1056 -- literal_flag - literal flag
1057 -- bin_schema_name - xform definition table schema name
1058 -- RETURNS
1059 -- xform_list - updated stack definition
1060 -- EXCEPTIONS
1061 -- None
1062 -- NOTES
1063 -- Literal flag indicates whether the values in {bin} are valid SQL
1064 -- literals. When the the flag is set to TRUE the value of {bin} is used
1065 -- "as is" in the expression generation, otherwise it is converted into a
1066 -- valid text literal (surround by quotes and double the quotes inside). By
1067 -- default the flag is set to FALSE. One example when it can be set to TRUE
1068 -- is in cases when all {bin} are numbers. In that case the xformed column
1069 -- will remain numeric as opposed to textual (default behavior). For
1070 -- example, for the following xfrom definition:
1071 -- COL VAL BIN
1072 -- ----- -------------- ----
1073 -- mycol Waltham 1
1074 -- mycol Burlington 1
1075 -- mycol Redwood Shores 2
1076 -- the following {expr} and {rexp} are generated when the flag is FALSE:
1077 -- expr = DECODE("mycol", 'Waltham', '1',
1078 -- 'Burlington', '1',
1079 -- 'Redwood Shores', '2')
1080 -- rexp = DECODE("COL1", '1', '''Waltham'', ''Burlington''',
1081 -- '2', '''Redwood Shores''',
1082 -- NULL, 'DEFAULT')
1083 -- and when the flag is set to TRUE:
1084 -- expr = DECODE("mycol", 'Waltham', 1,
1085 -- 'Burlington', 1,
1086 -- 'Redwood Shores', 2)
1087 -- rexp = DECODE("COL1", 1, '''Waltham'', ''Burlington''',
1088 -- 2, '''Redwood Shores''',
1089 -- NULL, 'DEFAULT')
1090 -----------------------------------------------------------------------------
1091 PROCEDURE stack_bin_cat(
1092 bin_table_name VARCHAR2,
1093 xform_list IN OUT NOCOPY TRANSFORM_LIST,
1094 literal_flag BOOLEAN DEFAULT FALSE,
1095 bin_schema_name VARCHAR2 DEFAULT NULL);
1096
1097 -----------------------------------------------------------------------------
1098 -- insert_bin_super --
1099 -----------------------------------------------------------------------------
1100 -- NAME
1101 -- insert_bin_super - INSERT into BINning SUPERvised
1102 -- DESCRIPTION
1103 -- For every NUMBER, VARCHAR2, or CHAR, as well as DM_NESTED_NUMERICAL, or
1104 -- DM_NESTED_CATEGORICAL column in the data table that is not in the
1105 -- exclusion list finds numerical and categorical binning definition
1106 -- and inserts them into the corresponding definition tables. Definition
1107 -- for each relevant column is computed based on the splits found by the
1111 -- num_table_name - numerical binning definition table
1108 -- Decision Tree model build on a single predictor. Columns that have no
1109 -- splits are inserted into the column removal definition table.
1110 -- PARAMETERS
1112 -- cat_table_name - categorical binning definition table
1113 -- data_table_name - data table
1114 -- target_column_name - target column
1115 -- max_bin_num - maximum number of bins
1116 -- exclude_list - column exclusion list
1117 -- num_schema_name - numerical definition table schema
1118 -- cat_schema_name - categorical definition table schema
1119 -- data_schema_name - data table schema name
1120 -- rem_table_name - column removal definition table
1121 -- rem_schema_name - removal definition table schema
1122 -- RETURNS
1123 -- None
1124 -- EXCEPTIONS
1125 -- None
1126 -- NOTES
1127 -- Column removal definition table is optional.
1128 -----------------------------------------------------------------------------
1129 PROCEDURE insert_bin_super(
1130 num_table_name VARCHAR2,
1131 cat_table_name VARCHAR2,
1132 data_table_name VARCHAR2,
1133 target_column_name VARCHAR2,
1134 max_bin_num PLS_INTEGER DEFAULT 1000,
1135 exclude_list COLUMN_LIST DEFAULT NULL,
1136 num_schema_name VARCHAR2 DEFAULT NULL,
1137 cat_schema_name VARCHAR2 DEFAULT NULL,
1138 data_schema_name VARCHAR2 DEFAULT NULL,
1139 rem_table_name VARCHAR2 DEFAULT NULL,
1140 rem_schema_name VARCHAR2 DEFAULT NULL);
1141
1142 -----------------------------------------------------------------------------
1143 -- xform_expr_num --
1144 -----------------------------------------------------------------------------
1145 -- NAME
1146 -- xform_expr_num - XFORMation EXPRession NUMber
1147 -- DESCRIPTION
1148 -- Creates a view that applies a given expression for every NUMBER column
1149 -- in the data table that is not in the exclusion list and in the
1150 -- inclusion list. The remaining columns do not change. Expressions are
1151 -- constructed from the expression pattern by replacing every occurance of
1152 -- the column pattern with an actual column name.
1153 -- Example 1. For a table TAB with two NUMBER columns CN1, CN3 and one
1154 -- CHAR columns CC2 and expression pattern TO_CHAR(:col) the following
1155 -- query is generated:
1156 -- SELECT TO_CHAR("CN1") "CN1", "CC2", TO_CHAR("CN3") "CN3"
1157 -- FROM TAB
1158 -- Example 2. This procedure can be used for clipping (winsorizing)
1159 -- normalized data to a [0..1] range, that is values x > 1 become 1 and
1160 -- values x < 0 become 0. For the table in example 1 and pattern
1161 -- CASE WHEN :col < 0 THEN 0 WHEN :col > 1 THEN 1 ELSE :col END
1162 -- the following query is generated:
1163 -- SELECT CASE WHEN "CN1" < 0 THEN 0 WHEN "CN1" > 1 THEN 1
1164 -- ELSE "CN1" END "CN1",
1165 -- "CC2",
1166 -- CASE WHEN "CN3" < 0 THEN 0 WHEN "CN3" > 1 THEN 1
1167 -- ELSE "CN3" END "CN3"
1168 -- FROM TAB
1169 -- PARAMETERS
1170 -- expr_pattern - expression pattern
1171 -- data_table_name - data table
1172 -- xform_view_name - xform view
1173 -- exclude_list - column exclusion list
1174 -- include_list - column inclusion list
1175 -- col_pattern - column pattern
1176 -- data_schema_name - data table schema name
1177 -- xform_schema_name - xform view schema name
1178 -- RETURNS
1179 -- None
1180 -- EXCEPTIONS
1181 -- None
1182 -- NOTES
1183 -- The default value of column pattern is ':col'. Column pattern is case
1184 -- sensetive. Expressions are constructed using SQL REPLACE function:
1185 -- REPALCE(expr_pattern, col_pattern, '"<column>"')||' "<column>"'
1186 -- NULL exclusion list is treated as an empty set (exclude none) and NULL
1187 -- inclusion list is treated as a full set (include all).
1188 -----------------------------------------------------------------------------
1189 PROCEDURE xform_expr_num(
1190 expr_pattern VARCHAR2,
1191 data_table_name VARCHAR2,
1192 xform_view_name VARCHAR2,
1193 exclude_list COLUMN_LIST DEFAULT NULL,
1194 include_list COLUMN_LIST DEFAULT NULL,
1195 col_pattern VARCHAR2 DEFAULT ':col',
1196 data_schema_name VARCHAR2 DEFAULT NULL,
1197 xform_schema_name VARCHAR2 DEFAULT NULL);
1198
1199 -----------------------------------------------------------------------------
1200 -- xform_expr_str --
1201 -----------------------------------------------------------------------------
1202 -- NAME
1203 -- xform_expr_str - XFORMation EXPRession STRing
1204 -- DESCRIPTION
1205 -- Similar to xform_expr_num, except that it applies to CHAR and VARCHAR2
1206 -- columns instead of NUMBER.
1207 -- PARAMETERS
1208 -- expr_pattern - expression pattern
1209 -- data_table_name - data table
1210 -- xform_view_name - xform view
1214 -- data_schema_name - data table schema name
1211 -- exclude_list - column exclusion list
1212 -- include_list - column inclusion list
1213 -- col_pattern - column pattern
1215 -- xform_schema_name - xform view schema name
1216 -- RETURNS
1217 -- None
1218 -- EXCEPTIONS
1219 -- None
1220 -- NOTES
1221 -- None
1222 -----------------------------------------------------------------------------
1223 PROCEDURE xform_expr_str(
1224 expr_pattern VARCHAR2,
1225 data_table_name VARCHAR2,
1226 xform_view_name VARCHAR2,
1227 exclude_list COLUMN_LIST DEFAULT NULL,
1228 include_list COLUMN_LIST DEFAULT NULL,
1229 col_pattern VARCHAR2 DEFAULT ':col',
1230 data_schema_name VARCHAR2 DEFAULT NULL,
1231 xform_schema_name VARCHAR2 DEFAULT NULL);
1232
1233 -----------------------------------------------------------------------------
1234 -- create_clip --
1235 -----------------------------------------------------------------------------
1236 -- NAME
1237 -- create_clip - CREATE CLIPping definition table
1238 -- DESCRIPTION
1239 -- Creates clippping definition table:
1240 -- CREATE TABLE <clip>(
1241 -- col VARCHAR2(30),
1242 -- att VARCHAR2(4000),
1243 -- lcut NUMBER,
1244 -- lval NUMBER,
1245 -- rcut NUMBER,
1246 -- rval NUMBER)
1247 -- This table is used to guide query generation process to construct
1248 -- clipping expressions of the following form:
1249 -- CASE WHEN "{col}" < {lcut} THEN {lval}
1250 -- WHEN "{col}" > {rcut} THEN {rval}
1251 -- ELSE "{col}"
1252 -- END "{col}"
1253 -- Example 1. (winsorizing) When col = 'my_col', lcut = -1.5, lval = -1.5,
1254 -- and rcut = 4.5 and rval = 4.5 the following expression is generated:
1255 -- CASE WHEN "my_col" < -1.5 THEN -1.5
1256 -- WHEN "my_col" > 4.5 THEN 4.5
1257 -- ELSE "my_col"
1258 -- END "my_col"
1259 -- Example 2. (trimming) When col = 'my_col', lcut = -1.5, lval = NULL,
1260 -- and rcut = 4.5 and rval = NULL the following expression is generated:
1261 -- CASE WHEN "my_col" < -1.5 THEN NULL
1262 -- WHEN "my_col" > 4.5 THEN NULL
1263 -- ELSE "my_col"
1264 -- END "my_col"
1265 -- PARAMETERS
1266 -- clip_table_name - clipping definition table
1267 -- clip_schema_name - clipping definition table schema name
1268 -- RETURNS
1269 -- None
1270 -- EXCEPTIONS
1271 -- None
1272 -- NOTES
1273 -- Note, that {col} is case sensitive since it generates quoted
1274 -- identifiers. When there are multiple entries in the xform defintion
1275 -- table for the same {col} the behavior is undefined. Any one of the
1276 -- definitions may be used in query generation. NULL values remain
1277 -- unchanged.
1278 -----------------------------------------------------------------------------
1279 PROCEDURE create_clip(
1280 clip_table_name VARCHAR2,
1281 clip_schema_name VARCHAR2 DEFAULT NULL);
1282
1283 -----------------------------------------------------------------------------
1284 -- insert_clip_winsor_tail --
1285 -----------------------------------------------------------------------------
1286 -- NAME
1287 -- insert_clip_winsor_tail - INSERT into CLIPping WINSORizing TAIL
1288 -- DESCRIPTION
1289 -- For every NUMBER column in the data table that is not in the exclusion
1290 -- list finds clipping definition and inserts it into the definition
1291 -- table. Definition for each relevant column is computed based on the
1292 -- non-NULL values sorted in ascending order such that val(1)<val(2)<...<
1293 -- val(N), where N is a total number of non-NULL values in a column:
1294 -- lcut = val(1+floor(N*q))
1295 -- lval = lcut
1296 -- rcut = val(N-floor(N*q))
1297 -- rval = rcut
1298 -- where q = ABS(NVL(tail_frac,0)). Nothing is done when q >= 0.5.
1299 -- PARAMETERS
1300 -- clip_table_name - clipping definition table
1301 -- data_table_name - data table
1302 -- tail_frac - tail fraction
1303 -- exclude_list - column exclusion list
1304 -- clip_schema_name - definition table schema name
1305 -- data_schema_name - data table schema name
1306 -- RETURNS
1307 -- None
1308 -- EXCEPTIONS
1309 -- None
1310 -- NOTES
1311 -- Columns with all NULLs are ignored.
1312 -----------------------------------------------------------------------------
1313 PROCEDURE insert_clip_winsor_tail(
1314 clip_table_name VARCHAR2,
1315 data_table_name VARCHAR2,
1316 tail_frac NUMBER DEFAULT 0.025,
1317 exclude_list COLUMN_LIST DEFAULT NULL,
1318 clip_schema_name VARCHAR2 DEFAULT NULL,
1319 data_schema_name VARCHAR2 DEFAULT NULL);
1320
1321 -----------------------------------------------------------------------------
1322 -- insert_clip_trim_tail --
1326 -- DESCRIPTION
1323 -----------------------------------------------------------------------------
1324 -- NAME
1325 -- insert_clip_trim_tail - INSERT into CLIPping TRIMming TAIL
1327 -- For every NUMBER column in the data table that is not in the exclusion
1328 -- list finds clipping definition and inserts it into the definition
1329 -- table. Definition for each relevant column is computed based on the
1330 -- non-NULL values sorted in ascending order such that val(1)<val(2)<...<
1331 -- val(N), where N is a total number of non-NULL values in a column:
1332 -- lcut = val(1+floor(N*q))
1333 -- lval = NULL
1334 -- rcut = val(N-floor(N*q))
1335 -- rval = NULL
1336 -- where q = ABS(NVL(tail_frac,0)). Nothing is done when q >= 0.5.
1337 -- PARAMETERS
1338 -- clip_table_name - clipping definition table
1339 -- data_table_name - data table
1340 -- tail_frac - tail fraction
1341 -- exclude_list - column exclusion list
1342 -- clip_schema_name - definition table schema name
1343 -- data_schema_name - data table schema name
1344 -- RETURNS
1345 -- None
1346 -- EXCEPTIONS
1347 -- None
1348 -- NOTES
1349 -- Columns with all NULLs are ignored.
1350 -----------------------------------------------------------------------------
1351 PROCEDURE insert_clip_trim_tail(
1352 clip_table_name VARCHAR2,
1353 data_table_name VARCHAR2,
1354 tail_frac NUMBER DEFAULT 0.025,
1355 exclude_list COLUMN_LIST DEFAULT NULL,
1356 clip_schema_name VARCHAR2 DEFAULT NULL,
1357 data_schema_name VARCHAR2 DEFAULT NULL);
1358
1359 -----------------------------------------------------------------------------
1360 -- xform_clip --
1361 -----------------------------------------------------------------------------
1362 -- NAME
1363 -- xform_clip - XFORMation CLIPping
1364 -- DESCRIPTION
1365 -- Creates a view that perfoms clipping of the data table. Only the
1366 -- columns that are specified in the xform definition are clipped, the
1367 -- remaining columns do not change.
1368 -- PARAMETERS
1369 -- clip_table_name - xform definition table
1370 -- data_table_name - data table
1371 -- xform_view_name - xform view
1372 -- clip_schema_name - xform definition table schema name
1373 -- data_schema_name - data table schema name
1374 -- xform_schema_name - xform view schema name
1375 -- RETURNS
1376 -- None
1377 -- EXCEPTIONS
1378 -- None
1379 -- NOTES
1380 -- None
1381 -----------------------------------------------------------------------------
1382 PROCEDURE xform_clip(
1383 clip_table_name VARCHAR2,
1384 data_table_name VARCHAR2,
1385 xform_view_name VARCHAR2,
1386 clip_schema_name VARCHAR2 DEFAULT NULL,
1387 data_schema_name VARCHAR2 DEFAULT NULL,
1388 xform_schema_name VARCHAR2 DEFAULT NULL);
1389
1390 -----------------------------------------------------------------------------
1391 -- stack_clip --
1392 -----------------------------------------------------------------------------
1393 -- NAME
1394 -- stack_clip - STACK CLIPping
1395 -- DESCRIPTION
1396 -- For every column in the stack definition that has a matching entry in
1397 -- the transformation definition, stacks clipping expression on top of the
1398 -- existing expression and updates the stack definition. Columns in the
1399 -- stack definition that have no matching entries in the transformation
1400 -- definition or have NULL expression are not changed. Columns in the
1401 -- transformation definition that have no matching entries in the stack
1402 -- definition are added to the stack definition (using "{col}" in place of
1403 -- the original expression or "VALUE" for nested attributes).
1404 -- Reverse expressions in the stack definition are updated accordingly,
1405 -- that is if an expression is updated, added or remains unchanged then a
1406 -- corresponding reverse expression is also updated, added or remains
1407 -- unchanged. When reverse expression is NULL it is treated as identity
1408 -- function ("{attr}" or "VALUE" for nested attributes).
1409 -- Given an entry ({attr}, {expr}, {rexp}) in the stack definition and a
1410 -- matching entry in the transformation definition, after stacking {expr}
1411 -- has the following form:
1412 -- CASE WHEN {expr} < {lcut} THEN {lval}
1413 -- WHEN {expr} > {rcut} THEN {rval}
1414 -- ELSE {expr}
1415 -- END
1416 -- and {rexp} remains unchanged
1417 -- Example 1. Given transformation definition:
1418 -- {col = COL1, lcut = -1.5, lval = -1.5, rcut = 4.5, rval = 4.5}
1419 -- {col = COL2, lcut = 0, lval = 0, rcut = 1, rval = 1}
1420 -- and stack definition:
1421 -- {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
1422 -- {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
1423 -- the following updated stack definition is generated:
1424 -- {attr = COL1,
1425 -- expr = CASE WHEN log(10, COL1) < -1.5 THEN -1.5
1426 -- WHEN log(10, COL1) > 4.5 THEN 4.5
1430 -- {attr = COL3,
1427 -- ELSE log(10, COL1)
1428 -- END,
1429 -- rexp = power(10, COL1)}
1431 -- expr = ln(COL3),
1432 -- rexp = exp(COL3)}
1433 -- {attr = COL2,
1434 -- expr = CASE WHEN COL2 < 0 THEN 0
1435 -- WHEN COL2 > 1 THEN 1
1436 -- ELSE COL2
1437 -- END,
1438 -- rexp = NULL}
1439 -- PARAMETERS
1440 -- clip_table_name - xform definition table
1441 -- xform_list - stack definition
1442 -- clip_schema_name - xform definition table schema name
1443 -- RETURNS
1444 -- xform_list - updated stack definition
1445 -- EXCEPTIONS
1446 -- None
1447 -- NOTES
1448 -- None
1449 -----------------------------------------------------------------------------
1450 PROCEDURE stack_clip(
1451 clip_table_name VARCHAR2,
1452 xform_list IN OUT NOCOPY TRANSFORM_LIST,
1453 clip_schema_name VARCHAR2 DEFAULT NULL);
1454
1455 -----------------------------------------------------------------------------
1456 -- create_miss_num --
1457 -----------------------------------------------------------------------------
1458 -- NAME
1459 -- create_miss_num - CREATE NUMerical MISSing value treatment definition
1460 -- table
1461 -- DESCRIPTION
1462 -- Creates numerical missing value treatment definition table:
1463 -- CREATE TABLE <miss_num>(
1464 -- col VARCHAR2(30),
1465 -- att VARCHAR2(4000),
1466 -- val NUMBER)
1467 -- This table is used to guide query generation process to construct
1468 -- missing value treatment expressions of the following form:
1469 -- NVL("{col}", {val}) "{col}"
1470 -- For example when col = 'my_col', val = 20 the
1471 -- following expression is generated:
1472 -- NVL("my_col", 20) "my_col"
1473 -- PARAMETERS
1474 -- miss_table_name - definition table
1475 -- miss_schema_name - definition table schema name
1476 -- RETURNS
1477 -- None
1478 -- EXCEPTIONS
1479 -- None
1480 -- NOTES
1481 -- Note, that {col} is case sensitive since it generates quoted
1482 -- identifiers. When there are multiple entries in the xform defintion
1483 -- table for the same {col} the behavior is undefined. Any one of the
1484 -- definitions may be used in query generation.
1485 -----------------------------------------------------------------------------
1486 PROCEDURE create_miss_num(
1487 miss_table_name VARCHAR2,
1488 miss_schema_name VARCHAR2 DEFAULT NULL);
1489
1490 -----------------------------------------------------------------------------
1491 -- insert_miss_num_mean --
1492 -----------------------------------------------------------------------------
1493 -- NAME
1494 -- insert_miss_num_mean - INSERT into NUMerical MISSining value treatment
1495 -- MEAN
1496 -- DESCRIPTION
1497 -- For every NUMBER column in the data table that is not in the exclusion
1498 -- list finds missing value treatment definition and inserts it into the
1499 -- definition table. Definition for each relevant column is computed based
1500 -- on the mean (average) value that is computed from the data table:
1501 -- val = mean
1502 -- The value of mean is rounded to round_num significant digits prior to
1503 -- storing it in the definition table.
1504 -- PARAMETERS
1505 -- miss_table_name - definition table
1506 -- data_table_name - data table
1507 -- exclude_list - column exclusion list
1508 -- round_num - number of significant digits
1509 -- miss_schema_name - definition table schema name
1510 -- data_schema_name - data table schema name
1511 -- RETURNS
1512 -- None
1513 -- EXCEPTIONS
1514 -- None
1515 -- NOTES
1516 -- Columns with all NULLs are ignored.
1517 -----------------------------------------------------------------------------
1518 PROCEDURE insert_miss_num_mean(
1519 miss_table_name VARCHAR2,
1520 data_table_name VARCHAR2,
1521 exclude_list COLUMN_LIST DEFAULT NULL,
1522 round_num PLS_INTEGER DEFAULT 6,
1523 miss_schema_name VARCHAR2 DEFAULT NULL,
1524 data_schema_name VARCHAR2 DEFAULT NULL);
1525
1526 -----------------------------------------------------------------------------
1527 -- xform_miss_num --
1528 -----------------------------------------------------------------------------
1529 -- NAME
1530 -- xform_miss_num - XFORMation NUMerical MISSing value treatment
1531 -- DESCRIPTION
1532 -- Creates a view that perfoms numerical missing value treatment of the
1533 -- data table. Only the columns that are specified in the xform definition
1534 -- are treated, the remaining columns do not change.
1535 -- PARAMETERS
1536 -- miss_table_name - xform definition table
1537 -- data_table_name - data table
1538 -- xform_view_name - xform view
1542 -- RETURNS
1539 -- miss_schema_name - xform definition table schema name
1540 -- data_schema_name - data table schema name
1541 -- xform_schema_name - xform view schema name
1543 -- None
1544 -- EXCEPTIONS
1545 -- None
1546 -- NOTES
1547 -- None
1548 -----------------------------------------------------------------------------
1549 PROCEDURE xform_miss_num(
1550 miss_table_name VARCHAR2,
1551 data_table_name VARCHAR2,
1552 xform_view_name VARCHAR2,
1553 miss_schema_name VARCHAR2 DEFAULT NULL,
1554 data_schema_name VARCHAR2 DEFAULT NULL,
1555 xform_schema_name VARCHAR2 DEFAULT NULL);
1556
1557 -----------------------------------------------------------------------------
1558 -- stack_miss_num --
1559 -----------------------------------------------------------------------------
1560 -- NAME
1561 -- stack_miss_num - STACK NUMerical MISSing value treatment
1562 -- DESCRIPTION
1563 -- For every column in the stack definition that has a matching entry in
1564 -- the transformation definition, stacks numerical missing value treatment
1565 -- expression on top of the existing expression and updates the stack
1566 -- definition. Columns in the stack definition that have no matching
1567 -- entries in the transformation definition or have NULL expression are
1568 -- not changed. Columns in the transformation definition that have no
1569 -- matching entries in the stack definition are added to the stack
1570 -- definition (using "{col}" in place of the original expression or "VALUE"
1571 -- for nested attributes). Reverse expressions in the stack definition are
1572 -- updated accordingly, that is if an expression is updated, added or
1573 -- remains unchanged then a corresponding reverse expression is also
1574 -- updated, added or remains unchanged. When reverse expression is NULL it
1575 -- is treated as identity function ("{attr}" or "VALUE" for nested
1576 -- attributes). Given an entry ({attr}, {expr}, {rexp}) in the stack
1577 -- definition and a matching entry in the transformation definition, after
1578 -- stacking {expr} has the following form:
1579 -- NVL({expr}, {val})
1580 -- and {rexp} remains unchanged
1581 -- Example 1. Given transformation definition:
1582 -- {col = COL1, val = 4.5}
1583 -- {col = COL2, val = 0}
1584 -- and stack definition:
1585 -- {attr = COL1, expr = log(10, COL1), rexp = power(10, COL1)}
1586 -- {attr = COL3, expr = ln(COL3), rexp = exp(COL3)}
1587 -- the following updated stack definition is generated:
1588 -- {attr = COL1,
1589 -- expr = NVL(log(10, COL1), 4.5),
1590 -- rexp = power(10, COL1)}
1591 -- {attr = COL3,
1592 -- expr = ln(COL3),
1593 -- rexp = exp(COL3)}
1594 -- {attr = COL2,
1595 -- expr = NVL(COL2, 0),
1596 -- rexp = NULL}
1597 -- PARAMETERS
1598 -- miss_table_name - xform definition table
1599 -- xform_list - stack definition
1600 -- miss_schema_name - xform definition table schema name
1601 -- RETURNS
1602 -- xform_list - updated stack definition
1603 -- EXCEPTIONS
1604 -- None
1605 -- NOTES
1606 -- None
1607 -----------------------------------------------------------------------------
1608 PROCEDURE stack_miss_num(
1609 miss_table_name VARCHAR2,
1610 xform_list IN OUT NOCOPY TRANSFORM_LIST,
1611 miss_schema_name VARCHAR2 DEFAULT NULL);
1612
1613 -----------------------------------------------------------------------------
1614 -- create_miss_cat --
1615 -----------------------------------------------------------------------------
1616 -- NAME
1617 -- create_miss_cat - CREATE CATegorical MISSing value treatment definition
1618 -- table
1619 -- DESCRIPTION
1620 -- Creates categorical missing value treatment definition table:
1621 -- CREATE TABLE <miss_cat>(
1622 -- col VARCHAR2(30),
1623 -- att VARCHAR2(4000),
1624 -- val VARCHAR2(4000))
1625 -- This table is used to guide query generation process to construct
1626 -- missing value treatment expressions of the following form:
1627 -- NVL("{col}", {val}) "{col}"
1628 -- For example when col = 'zip_code', val = 'MA' the
1629 -- following expression is generated:
1630 -- NVL("zip_code", 'MA') "zip_code"
1631 -- PARAMETERS
1632 -- miss_table_name - definition table
1633 -- miss_schema_name - definition table schema name
1634 -- RETURNS
1635 -- None
1636 -- EXCEPTIONS
1637 -- None
1638 -- NOTES
1639 -- Note, that {col} is case sensitive since it generates quoted
1640 -- identifiers. When there are multiple entries in the xform defintion
1641 -- table for the same {col} the behavior is undefined. Any one of the
1642 -- definitions may be used in query generation.
1643 -----------------------------------------------------------------------------
1644 PROCEDURE create_miss_cat(
1645 miss_table_name VARCHAR2,
1646 miss_schema_name VARCHAR2 DEFAULT NULL);
1647
1648 -----------------------------------------------------------------------------
1649 -- insert_miss_cat_mode --
1653 -- treatment MODE
1650 -----------------------------------------------------------------------------
1651 -- NAME
1652 -- insert_miss_cat_mode - INSERT into CATegorical MISSining value
1654 -- DESCRIPTION
1655 -- For every VARCHAR2, CHAR column in the data table that is not in the
1656 -- exclusion list finds missing value treatment definition and inserts it
1657 -- into the definition table. Definition for each relevant column is
1658 -- computed based on the mode value that is computed from the data table:
1659 -- val = mode
1660 -- PARAMETERS
1661 -- miss_table_name - definition table
1662 -- data_table_name - data table
1663 -- exclude_list - column exclusion list
1664 -- miss_schema_name - definition table schema name
1665 -- data_schema_name - data table schema name
1666 -- RETURNS
1667 -- None
1668 -- EXCEPTIONS
1669 -- None
1670 -- NOTES
1671 -- Columns with all NULLs are ignored.
1672 -----------------------------------------------------------------------------
1673 PROCEDURE insert_miss_cat_mode(
1674 miss_table_name VARCHAR2,
1675 data_table_name VARCHAR2,
1676 exclude_list COLUMN_LIST DEFAULT NULL,
1677 miss_schema_name VARCHAR2 DEFAULT NULL,
1678 data_schema_name VARCHAR2 DEFAULT NULL);
1679
1680 -----------------------------------------------------------------------------
1681 -- xform_miss_cat --
1682 -----------------------------------------------------------------------------
1683 -- NAME
1684 -- xform_miss_cat - XFORMation CATegorical MISSing value treatment
1685 -- DESCRIPTION
1686 -- Creates a view that perfoms categorical missing value treatment of the
1687 -- data table. Only the columns that are specified in the xform definition
1688 -- are treated, the remaining columns do not change.
1689 -- PARAMETERS
1690 -- miss_table_name - xform definition table
1691 -- data_table_name - data table
1692 -- xform_view_name - xform view
1693 -- miss_schema_name - xform definition table schema name
1694 -- data_schema_name - data table schema name
1695 -- xform_schema_name - xform view schema name
1696 -- RETURNS
1697 -- None
1698 -- EXCEPTIONS
1699 -- None
1700 -- NOTES
1701 -- The data type of the xformed columns is preseved by putting a CAST
1702 -- expression around NVL. For example, when col = 'zip_code', val = 'MA'
1703 -- the data type is CHAR(2) the following expression is generated:
1704 -- CAST(NVL("zip_code", 'MA') AS CHAR(2)) "zip_code"
1705 -----------------------------------------------------------------------------
1706 PROCEDURE xform_miss_cat(
1707 miss_table_name VARCHAR2,
1708 data_table_name VARCHAR2,
1709 xform_view_name VARCHAR2,
1710 miss_schema_name VARCHAR2 DEFAULT NULL,
1711 data_schema_name VARCHAR2 DEFAULT NULL,
1712 xform_schema_name VARCHAR2 DEFAULT NULL);
1713
1714 -----------------------------------------------------------------------------
1715 -- stack_miss_cat --
1716 -----------------------------------------------------------------------------
1717 -- NAME
1718 -- stack_miss_cat - STACK CATegorical MISSing value treatment
1719 -- DESCRIPTION
1720 -- For every column in the stack definition that has a matching entry in
1721 -- the transformation definition, stacks categorical missing value
1722 -- treatment expression on top of the existing expression and updates the
1723 -- stack definition. Columns in the stack definition that have no matching
1724 -- entries in the transformation definition or have NULL expression are
1725 -- not changed. Columns in the transformation definition that have no
1726 -- matching entries in the stack definition are added to the stack
1727 -- definition (using "{col}" in place of the original expression or "VALUE"
1728 -- for nested attributes). Reverse expressions in the stack definition are
1729 -- updated accordingly, that is if an expression is updated, added or
1730 -- remains unchanged then a corresponding reverse expression is also
1731 -- updated, added or remains unchanged. When reverse expression is NULL it
1732 -- is treated as identity function ("{attr}" or "VALUE" for nested
1733 -- attributes). Given an entry ({attr}, {expr}, {rexp}) in the stack
1734 -- definition and a matching entry in the transformation definition, after
1735 -- stacking {expr} has the following form:
1736 -- NVL({expr}, {val})
1737 -- and {rexp} remains unchanged
1738 -- Example 1. Given transformation definition:
1739 -- {col = COL1, val = 'ma'}
1740 -- {col = COL2, val = 'CA'}
1741 -- and stack definition:
1742 -- {attr = COL1, expr = lower(COL1), rexp = initcap(COL1)}
1743 -- {attr = COL3, expr = upper(COL3), rexp = initcap(COL3)}
1744 -- the following updated stack definition is generated:
1745 -- {attr = COL1,
1746 -- expr = NVL(lower(COL1), 'ma'),
1747 -- rexp = initcap(COL1)}
1748 -- {attr = COL3,
1749 -- expr = upper(COL3),
1750 -- rexp = initcap(COL3)}
1751 -- {attr = COL2,
1752 -- expr = NVL(COL2, 'CA'),
1753 -- rexp = NULL}
1754 -- PARAMETERS
1755 -- miss_table_name - xform definition table
1759 -- xform_list - updated stack definition
1756 -- xform_list - stack definition
1757 -- miss_schema_name - xform definition table schema name
1758 -- RETURNS
1760 -- EXCEPTIONS
1761 -- None
1762 -- NOTES
1763 -- None
1764 -----------------------------------------------------------------------------
1765 PROCEDURE stack_miss_cat(
1766 miss_table_name VARCHAR2,
1767 xform_list IN OUT NOCOPY TRANSFORM_LIST,
1768 miss_schema_name VARCHAR2 DEFAULT NULL);
1769
1770 -----------------------------------------------------------------------------
1771 -- xform_stack --
1772 -----------------------------------------------------------------------------
1773 -- NAME
1774 -- xform_stack - XFORMation STACK
1775 -- DESCRIPTION
1776 -- Creates a view that perfoms transformation of the data table specified
1777 -- by the stack definition. Only the columns that are specified in the
1778 -- stack definition are transformed, the remaining columns do not change.
1779 -- All columns in the stack definition are applied. Columns with NULL value
1780 -- in the expression field are excluded. Columns in the stack definition
1781 -- that do not have a matching column in the data are added to the view.
1782 -- PARAMETERS
1783 -- xform_list - stack definition
1784 -- data_table_name - data table
1785 -- xform_view_name - xform view
1786 -- data_schema_name - data table schema name
1787 -- xform_schema_name - xform view schema name
1788 -- RETURNS
1789 -- None
1790 -- EXCEPTIONS
1791 -- None
1792 -- NOTES
1793 -- None
1794 -----------------------------------------------------------------------------
1795 PROCEDURE xform_stack(
1796 xform_list TRANSFORM_LIST,
1797 data_table_name VARCHAR2,
1798 xform_view_name VARCHAR2,
1799 data_schema_name VARCHAR2 DEFAULT NULL,
1800 xform_schema_name VARCHAR2 DEFAULT NULL);
1801
1802 -----------------------------------------------------------------------------
1803 -- describe_stack --
1804 -----------------------------------------------------------------------------
1805 -- NAME
1806 -- describe_stack - DESCRIBE STACK
1807 -- DESCRIPTION
1808 -- Desribes transformation of the data table specified by the stack
1809 -- definition. Only the columns that are specified in the stack definition
1810 -- are transformed, the remaining columns do not change. All columns in the
1811 -- stack definition are applied. Columns with NULL value in the expression
1812 -- field are excluded. Columns in the stack definition that do not have a
1813 -- matching column in the data are added to the describe list.
1814 -- PARAMETERS
1815 -- xform_list - stack definition
1816 -- data_table_name - data table
1817 -- describe_list - describe list/describe list3
1818 -- data_schema_name - data table schema name
1819 -- RETURNS
1820 -- None
1821 -- EXCEPTIONS
1822 -- None
1823 -- NOTES
1824 -- When the DESCRIBE_LIST type is specified for the describe_list parameter
1825 -- a DM_NESTED_NUMERICALS type is indicated by a value of nest_num_col_type
1826 -- in the col_type field and nest_cat_col_type for DM_NESTED_CATEGORICALS.
1827 --
1828 -- When the DESCRIBE_LIST3 type is specified for the describe_list
1829 -- parameter, nested types are indicated with the ADT type in the col_type
1830 -- field with the specific nested type indicated in the col_name_type
1831 -- field as a string. This behavior is consistent with the call to
1832 -- describe_columns3 in the dbms_sql package (dbmssql.sql).
1833 -----------------------------------------------------------------------------
1834 PROCEDURE describe_stack(
1835 xform_list TRANSFORM_LIST,
1836 data_table_name VARCHAR2,
1837 describe_list OUT DESCRIBE_LIST,
1838 data_schema_name VARCHAR2 DEFAULT NULL);
1839
1840 PROCEDURE describe_stack(
1841 xform_list TRANSFORM_LIST,
1842 data_table_name VARCHAR2,
1843 describe_list OUT DESCRIBE_LIST3,
1844 data_schema_name VARCHAR2 DEFAULT NULL);
1845
1846 -----------------------------------------------------------------------------
1847 -- set_expression --
1848 -----------------------------------------------------------------------------
1849 -- NAME
1850 -- set_expression - SET xform EXPRESSION
1851 -- DESCRIPTION
1852 -- Appends a VARCHAR2 chunk to the expression. Appeding NULL clears the
1853 -- expression.
1854 -- PARAMETERS
1855 -- expression - expression
1856 -- chunk - VARCHAR2 chunk
1857 -- RETURNS
1858 -- None
1859 -- EXCEPTIONS
1860 -- None
1861 -- NOTES
1862 -- None
1863 -----------------------------------------------------------------------------
1864 PROCEDURE set_expression(
1865 expression IN OUT NOCOPY EXPRESSION_REC,
1866 chunk VARCHAR2 DEFAULT NULL);
1867
1871 -- NAME
1868 -----------------------------------------------------------------------------
1869 -- get_expression --
1870 -----------------------------------------------------------------------------
1872 -- get_expression - GET xform EXPRESSION
1873 -- DESCRIPTION
1874 -- Gets a VARCHAR2 chunk from the expression. Chunks numbering starts from
1875 -- one. For chunks outside of the range the return value is NULL. When
1876 -- chunk number is NULL the whole expression is returned as a string. If
1877 -- expression is too big VALUE_ERROR is raised.
1878 -- PARAMETERS
1879 -- expression - expression
1880 -- chunk_num - chunk number
1881 -- RETURNS
1882 -- VARCHAR2 chunk.
1883 -- EXCEPTIONS
1884 -- VALUE_ERROR
1885 -- NOTES
1886 -- None
1887 -----------------------------------------------------------------------------
1888 FUNCTION get_expression(
1889 expression EXPRESSION_REC,
1890 chunk_num PLS_INTEGER DEFAULT NULL)
1891 RETURN VARCHAR2;
1892
1893 -----------------------------------------------------------------------------
1894 -- set_transform --
1895 -----------------------------------------------------------------------------
1896 -- NAME
1897 -- set_transform - SET TRANSFORM list
1898 -- DESCRIPTION
1899 -- Appends an element to the transformation list.
1900 -- PARAMETERS
1901 -- expression - expression
1902 -- chunk - VARCHAR2 chunk
1903 -- RETURNS
1904 -- None
1905 -- EXCEPTIONS
1906 -- None
1907 -- NOTES
1908 -- None
1909 -----------------------------------------------------------------------------
1910 PROCEDURE set_transform(
1911 xform_list IN OUT NOCOPY TRANSFORM_LIST,
1912 attribute_name VARCHAR2,
1913 attribute_subname VARCHAR2,
1914 expression CLOB,
1915 reverse_expression VARCHAR2,
1916 attribute_spec VARCHAR2 DEFAULT NULL);
1917 END dbms_data_mining_transform;