1 PACKAGE ctx_cls AUTHID current_user AS
2
3 /*------------------------------- TYPE DEFINITIONS ----------------------------*/
4 /* in-memory table for document assignment */
5 TYPE doc_rec IS RECORD (
6   docid     NUMBER, -- ID that identifies the document
7   clusterid NUMBER, -- ID of the cluster this document was assigned to
8   score     NUMBER  -- similarity score between the document and that cluster
9 );
10 TYPE doc_tab IS TABLE OF doc_rec INDEX BY BINARY_INTEGER;
11
12 /* in-memory table for cluster information */
13 TYPE cluster_rec IS RECORD (
14   clusterid     NUMBER,         -- ID that identifies the cluster
15   descript      VARCHAR2(4000), -- descriptive string for the cluster
16   label         VARCHAR2(200),  -- suggested label for the cluster
17   sze           NUMBER,         -- count of documents assigned to the cluster
18   quality_score NUMBER,         -- quality score of the cluster
19   parent        NUMBER          -- ID of the parent cluster; negative when there is no parent
20 );
21 TYPE cluster_tab IS TABLE OF cluster_rec INDEX BY BINARY_INTEGER;
22
23 TYPE docid_tab IS TABLE OF NUMBER INDEX BY BINARY_INTEGER; -- in-memory list of document IDs
24
25 /*------------------------------- train for ctx-rules --------------------------*/
26 /*
27 NAME
28 train - automatically generate ctx-rules from training examples
29
30 DESCRIPTION
31 This procedure will generate the ctx-rules for a given set of training
32 examples. The training examples are contained in the following two tables
33
34 table1: doctab must have the following columns:
35 docid number primary key
36 text doc. column which can be indexed by context index
37 table2: category table must have the following columns:
38 docid CONSTRAINT fk_id REFERENCES doctab(docid)
39 category_id number
40
41 The foreign key in the category table is recommended but not required.
42
43 The rules will be written to the result table specified.
44 The result table must have the following columns:
45
46 category_id number (the category_id)
47 query varchar2(4000) (the rule)
48 confidence number (the confidence level
49 (percentage) that a document
50 is relevant if this rule is
51 satisfied )
52
53 The names of the tables and columns are not necessarily the same as above.
54
55 ARGUMENTS
56 index_name - the name of the text index
57 docid - the name of docid column in document table
58 cattab - the name of category table
59 catdocid - the name of docid column in category table
60 catid - the name of category ID column in category table
61 restab - the name of result table
62 rescatid - the name of category ID column in result table
63 resquery - the name of query column in result table
64 resconfid - the name of confidence column in result table
65 pref_name - the name of preference
66 */
67
68 PROCEDURE train (
69   index_name IN VARCHAR2,              -- text index name
70   docid      IN VARCHAR2,              -- docid column name in the document table
71   cattab     IN VARCHAR2,              -- category table name
72   catdocid   IN VARCHAR2,              -- docid column name in the category table
73   catid      IN VARCHAR2,              -- category ID column name in the category table
74   restab     IN VARCHAR2,              -- result table name
75   rescatid   IN VARCHAR2,              -- category ID column name in the result table
76   resquery   IN VARCHAR2,              -- query (rule) column name in the result table
77   resconfid  IN VARCHAR2,              -- confidence column name in the result table
78   pref_name  IN VARCHAR2 DEFAULT NULL  -- preference name; NULL when omitted
79 );
80 PRAGMA SUPPLEMENTAL_LOG_DATA(train, AUTO);
81
82 /*------------------------------- generic train API ---------------------------*/
83 /*
84 NAME
85 train - automatically generate a predictive model from examples for
86 classification
87
88 DESCRIPTION
89 This procedure will generate the predictive model from a given set of
90 training examples. The training examples are contained in the following
91 two tables
92
93 table1: doctab must have the following columns:
94 docid number primary key
95 text doc. column which can be indexed by context index
96 table2: category table must have the following columns:
97 docid number
98 category_id number
99
100 The names of the tables and columns are not necessarily the same as above.
101
102 The predictive model (classifier) will be written to the result table.
103 The result table is either created by the user before calling this procedure or
104 created by this program with the specified table name and under the current
105 user (if the specified table does not exist).
106 If the user creates the result table (which can support table schemas for
107 different users), the table should have the following three columns with the
108 exact column names:
109 cat_id number
110 type number(3) not null
111 rule clob
112
113 ARGUMENTS
114 index_name - the name of the text index
115 docid - the name of docid column in document table
116 cattab - the name of category table
117 catdocid - the name of docid column in category table
118 catid - the name of category ID column in category table
119 restab - the name of generated result table
120 pref_name - the name of preference
121 */
122 PROCEDURE train (
123   index_name IN VARCHAR2,  -- text index name
124   docid      IN VARCHAR2,  -- docid column name in the document table
125   cattab     IN VARCHAR2,  -- category table name
126   catdocid   IN VARCHAR2,  -- docid column name in the category table
127   catid      IN VARCHAR2,  -- category ID column name in the category table
128   restab     IN VARCHAR2,  -- name of the generated result table
129   pref_name  IN VARCHAR2   -- preference name
130 );
131 PRAGMA SUPPLEMENTAL_LOG_DATA(train, AUTO);
132
133
134 /* ---------- clustering API for permanent table result------------------------*/
135 /*
136 NAME
137 clustering - clustering a collection
138
139 DESCRIPTION
140 This procedure will generate a set of sub-groups (clusters) from a provided
141 collection of documents. The collection is given by a table which has a
142 context index built with or without population. The collection table has at
143 least the following two columns, whose names may not be exactly the same.
144 docid number primary key
145 text doc. column which can be indexed by context index
146
147 The output of clustering is represented by two tables:
148
149 table 1: document membership table having the following columns with the
150 exact same names
151 docid number -- document ID to identify a document
152 clusterid number -- the ID of the cluster the document is assigned to
153 score number -- the similarity score between document and cluster
154
155 table 2: cluster description table having the following columns with the
156 exact same names
157 clusterid number -- cluster ID to identify a cluster
158 descript varchar2(4000) -- a string to describe the cluster
159 label varchar2(200) -- a suggested label for the cluster
160 size number -- number of documents assigned to the cluster (NOTE(review): cluster_rec names this field sze -- confirm the actual column name)
161 quality_score number -- the quality score of the cluster
162 parent number -- parent cluster id. negative means no parent
163
164 The output tables can either be created by users before calling this
165 function or created in this program with the specified table name and
166 under the current user (if the specified table does not exist).
167
168 ARGUMENTS:
169 index_name - the name of the text index
170 docid - the name of docid column in document table
171 doctab_name - the name of document membership table
172 clstab_name - the name of cluster description table
173 pref_name - the name of the preference
174 */
175 PROCEDURE clustering (
176   index_name  IN VARCHAR2,              -- text index name
177   docid       IN VARCHAR2,              -- docid column name in the document table
178   doctab_name IN VARCHAR2,              -- document membership table name
179   clstab_name IN VARCHAR2,              -- cluster description table name
180   pref_name   IN VARCHAR2 DEFAULT NULL  -- preference name; NULL when omitted
181 );
182 PRAGMA SUPPLEMENTAL_LOG_DATA(clustering, AUTO);
183
184 /* ---------- clustering API for in-memory table result------------------------*/
185 /*
186 NAME
187 clustering - clustering a collection
188
189 DESCRIPTION
190 This procedure will generate a set of sub-groups (clusters) from a provided
191 collection of documents. The collection is given by a table which has a
192 context index built with or without population. The collection table has at
193 least the following two columns, whose names may not be exactly the same.
194 docid number primary key
195 text doc. column which can be indexed by context index
196
197 The output of clustering is represented by two in-memory tables:
198
199 table 1: document membership table ctx_cls.doc_tab
200 table 2: cluster description table ctx_cls.cluster_tab
201
202 ARGUMENTS:
203 index_name - the name of the text index
204 docid - the name of docid column in document table
205 dids - docid list to be clustered
206 doctab_name - the in-memory document membership table (ctx_cls.doc_tab)
207 clstab_name - the in-memory cluster description table (ctx_cls.cluster_tab)
208 pref_name - the name of the preference
209 */
210 PROCEDURE clustering (
211   index_name  IN VARCHAR2,                  -- text index name
212   docid       IN VARCHAR2,                  -- docid column name in the document table
213   dids        IN docid_tab,                 -- list of docids to be clustered
214   doctab_name IN OUT NOCOPY doc_tab,        -- in-memory document membership table
215   clstab_name IN OUT NOCOPY cluster_tab,    -- in-memory cluster description table
216   pref_name   IN VARCHAR2 DEFAULT NULL      -- preference name; NULL when omitted
217 );
218 --PRAGMA SUPPLEMENTAL_LOG_DATA(clustering, AUTO);
219
220 END ctx_cls;