1 PACKAGE BODY CS_KB_KWIC_UTIL AS
2 /* $Header: cskbkwicb.pls 120.3 2006/05/18 16:22:00 klou noship $ */
3
4 /*
5 *
6 * +======================================================================+
7 * | Copyright (c) 2005 Oracle Corporation |
8 * | Redwood Shores, California, USA |
9 * | All rights reserved. |
10 * +======================================================================+
11 *
12 * FILENAME
13 * cskbkwicb.pls
14 * PURPOSE
15 * Creates the package body for CS_KB_KWIC_UTIL
16 * CS_KB_KWIC_UTIL supports the Keywords In Context implementation
17 *
18 * HISTORY
19 * 12-APR-2005 HMEI Created.
20 * 11-JUL-2005 HMEI Changed get_segment_kwic to use different regular
21 * expressions for pre/post segement. Removed
22 * string reverse logic
23 * 12-JUL-2005 HMEI Changed get_segment_kwic to use custom logic for
24 * determining pre-segment. Cannot use a single
25 * pattern ending in '$', since performance suffers.
26 * 18-JUN-2006 KLOU Fix bug 5217204
27 * Change size of l_kwic_segment to 32000 as the regexp_replace
28 * can return up to 32K of content.
29 */
30
31
32 -- Maximum keyword segment length
33 MAX_SEGMENT_LENGTH CONSTANT NUMBER := 200;
34 -- Left and Right padding used when calculating KWIC
35 SEGMENT_PADDING CONSTANT NUMBER := MAX_SEGMENT_LENGTH/2;
36
37 --
38 -- This API highlights all keywords (p_text_query) in a given document
39 -- (p_document) using the specified start and end tags. This API is intended
40 -- for highlighting a short document, e.g. solution title or SR summary.
41 --
42 -- Parameters:
43 -- p_text_query VARCHAR2, keywords used for highlighting
44 -- p_document VARCHAR2, document to be highlighted
45 -- p_starttag VARCHAR2, start highlighting tag
46 -- p_endtag VARCHAR2, end highlighting tag
47 -- Returns:
48 -- The highlighted document
49 -- Since 12.0
50 --
51 FUNCTION highlight_text
52 (
53 p_text_query IN VARCHAR2,
54 p_document IN VARCHAR2,
55 p_starttag IN VARCHAR2,
56 p_endtag IN VARCHAR2
57 ) RETURN VARCHAR2
58 IS
59 l_result_segment CLOB;
60 BEGIN
61 CTX_DOC.POLICY_MARKUP (
62 policy_name => 'CS_KB_KWIC_POLICY',
63 document => p_document,
64 text_query => p_text_query,
65 starttag => p_starttag,
66 endtag => p_endtag,
67 restab => l_result_segment);
68 return l_result_segment;
69 END highlight_text;
70
71 --
72 -- This Utility method escapes regular expression operators in a string.
73 -- This is useful if you want to literally match a string that may contain
74 -- a regular expression operator. (e.g. 'e*trade')
75 --
76 -- Parameters:
77 -- p_string VARCHAR2, the string to escape
78 -- Returns:
79 -- string with regexp special characters escaped.
80 -- Since 12.0
81 --
82 FUNCTION regexp_escape
83 (
84 p_string IN VARCHAR2
85 ) RETURN VARCHAR2
86 IS
87 l_string VARCHAR2(32767); -- the final string after escaping
88 l_special_chars VARCHAR2(100); --chars to escape
89 l_char VARCHAR2(8);
90 BEGIN
91 l_string := p_string;
92 l_special_chars := '\[](){}*+$^?|.'; -- '\' must come first
93
94 FOR i in 1..length(l_special_chars) LOOP
95 l_char := substr(l_special_chars, i, 1);
96 l_string := replace(l_string, l_char, '\' || l_char);
97 END LOOP;
98
99 return l_string;
100 END regexp_escape;
101
102 --
103 -- This API highlights all keywords (p_text_query) in a given document
104 -- (p_document) using the specified start and end tags. It scans the
105 -- document to construct a highlighted segment that contains the most
106 -- number of distinct keywords, with some surrounding context.
107 -- If no keywords found, will return NULL
108 --
109 -- Parameters:
110 -- p_text_query VARCHAR2, keywords used for highlighting
111 -- p_starttag VARCHAR2, start highlighting tag
112 -- p_endtag VARCHAR2, end highlighting tag
113 -- p_document VARCHAR2, document to be highlighted
114 -- Returns:
115 -- The highlighted KWIC (Key Words In Context) segment,
116 -- or NULL if no keywords found.
117 -- Since 12.0
118 --
119 FUNCTION get_segment_kwic
120 (
121 p_text_query IN VARCHAR2,
122 p_starttag IN VARCHAR2,
123 p_endtag IN VARCHAR2,
124 p_document IN CLOB
125 ) RETURN VARCHAR2
126 IS
127 l_restab CTX_DOC.highlight_tab; -- Keyword match info returned by CTX_DOC
128
129 TYPE number_table IS TABLE OF INTEGER INDEX BY PLS_INTEGER;
130 -- word occupies the range [open, close)
131 l_open_positions number_table; -- open positions of matching words
132 l_close_positions number_table; -- close positions of matching words
133
134 i INTEGER;
135 idx INTEGER;
136 openIdx INTEGER; -- index for looping through the open positions
137 closeIdx INTEGER; -- index for looping through the close positions
138
139 -- Map to hold distinct keywords
140 TYPE distinct_word_map IS TABLE OF INTEGER INDEX BY VARCHAR2(32767);
141 l_distinct_words_master distinct_word_map; -- master list of distinct words
142 l_distinct_words distinct_word_map; -- distinct words for a given segment
143
144 l_word VARCHAR2(32767);
145 l_len INTEGER;
146
147 --used when calculating the best segment
148 l_word_start INTEGER;
149 l_word_end INTEGER;
150 l_segment_start INTEGER;
151 l_segment_end INTEGER;
152
153 --represent the best segment
154 l_best_count INTEGER := 0; --best # of distinct keywords in any segment
155 l_best_segment_start INTEGER := null;
156 l_best_segment_end INTEGER := null;
157 l_best_segment VARCHAR2(1000);
158
159 --pre and post segments (the "Context" part of KWIC)
160 l_pre_segment VARCHAR2(1000);
161 l_post_segment VARCHAR2(1000);
162 l_pre_segment_start INTEGER;
163 l_post_segment_end INTEGER;
164
165 l_space_positions number_table; -- used to calculate pre-segement
166 l_pre_pattern VARCHAR2(50); -- regular expression
167 l_post_pattern VARCHAR2(50); -- regular expression
168
169 -- (5217204)
170 l_kwic_segment VARCHAR2(32000); -- the final kwic segment
171 BEGIN
172
173 -- positions of matching words returned in l_restab
174 CTX_DOC.POLICY_HIGHLIGHT (
175 policy_name => 'CS_KB_KWIC_POLICY',
176 document => p_document,
177 text_query => p_text_query,
178 restab => l_restab );
179
180 -- form master list of distinct keywords found in the document
181 i := l_restab.FIRST;
182 WHILE (i IS NOT NULL) LOOP
183 l_open_positions(i) := l_restab(i).offset;
184 l_close_positions(i) := l_restab(i).offset + l_restab(i).length;
185
186 dbms_lob.read(p_document,
187 l_restab(i).length,
188 l_restab(i).offset,
189 l_word );
190
191 l_distinct_words_master( LOWER(l_word) ) := 1; --keys are the keywords
192
193 i := l_restab.NEXT(i);
194 END LOOP;
195
196 --dbms_output.put_line('# Distinct keywords:' || l_distinct_words_master.COUNT);
197
198 -- Find the best segment (that with the most distinct keywords)
199 -- Loop through open and close positions and form the largest segment
200 -- possible. For each segment, keep track of distinct keywords. Note
201 -- each segment begins and ends with a matching keyword
202 openIdx := l_open_positions.FIRST;
203 WHILE (openIdx IS NOT NULL AND
204 l_best_count < l_distinct_words_master.COUNT
205 ) LOOP
206 --segment start and end positions for this iteration
207 l_segment_start := l_open_positions(openIdx);
208 l_segment_end := null;
209
210 l_distinct_words.DELETE; -- clear out keyword map.
211
212 closeIdx := openIdx; -- slide the closeIdx to determine the segment close.
213 WHILE (closeIdx IS NOT NULL AND
214 l_close_positions(closeIdx) - l_segment_start <= MAX_SEGMENT_LENGTH
215 ) LOOP
216 l_segment_end := l_close_positions(closeIdx); --current end of segment
217
218 -- read the current keyword
219 l_word_start := l_open_positions(closeIdx);
220 l_word_end := l_close_positions(closeIdx);
221 l_len := l_word_end - l_word_start;
222 dbms_lob.read(p_document,
223 l_len,
224 l_word_start,
225 l_word );
226 --dbms_output.put_line(l_word_start || ' ' || l_word_end || ' [' || l_word || ']');
227 l_distinct_words( LOWER(l_word) ) := 1; -- add keyword to map
228
229 closeIdx := l_close_positions.NEXT(closeIdx);
230 END LOOP;
231
232 -- Note: A Corner Case:
233 -- l_segment_end is null if the keyword length > MAX_SEGMENT_LENGTH
234
235 if(l_distinct_words.COUNT > l_best_count AND
236 l_segment_end IS NOT NULL)
237 then
238 l_best_count := l_distinct_words.COUNT;
239 l_best_segment_start := l_segment_start;
240 l_best_segment_end := l_segment_end;
241 end if;
242 --dbms_output.put_line( '[' || l_segment_start || ',' || l_segment_end || '] = ' || l_distinct_words.COUNT);
243
244 if(closeIdx IS NOT NULL AND
245 l_segment_end IS NOT NULL AND
246 l_close_positions(closeIdx) - l_segment_end >= MAX_SEGMENT_LENGTH)
247 then
248 openIdx := l_open_positions.NEXT(closeIdx); --look-ahead optimization
249 --dbms_output.put_line( 'Looking AHEAD');
250 else
251 openIdx := l_open_positions.NEXT(openIdx);
252 end if;
253
254 END LOOP;
255
256 -- could not find a best segment
257 if(l_best_segment_start IS NULL) then return NULL; end if;
258
259 -- now calculate the context surrounding the snippet
260 -- find pre segment start position
261 l_pre_segment_start := l_best_segment_start - SEGMENT_PADDING;
262 if(l_pre_segment_start < 1) then
263 l_pre_segment_start := 1;
264 end if;
265 -- find the post segment end position
266 l_post_segment_end := l_best_segment_end + SEGMENT_PADDING;
267 l_len := dbms_lob.getlength(p_document);
268 if(l_post_segment_end > l_len) then
269 l_post_segment_end := l_len + 1; --last position + 1
270 end if;
271
272 -- get the middle segment (the one with the densest keywords)
273 l_len := l_best_segment_end - l_best_segment_start;
274 dbms_lob.read(p_document,
275 l_len,
276 l_best_segment_start,
277 l_best_segment );
278
279 -- now get the prefix segment
280 l_len := l_best_segment_start - l_pre_segment_start;
281 if(l_len > 0) then
282 dbms_lob.read(p_document,
283 l_len,
284 l_pre_segment_start,
285 l_pre_segment );
286 end if;
287
288 -- now get the postfix segment
289 l_len := l_post_segment_end - l_best_segment_end;
290 if(l_len > 0) then
291 dbms_lob.read(p_document,
292 l_len,
293 l_best_segment_end,
294 l_post_segment );
295 end if;
296
297 -- dbms_output.put_line('PRE: [' || l_pre_segment || ']');
298
299 -- Now refine the pre and post segments
300 -- try to match up to 8 words on either side (pre and post)
301
302 --NOTE: cannot use a single expression ending in '$'. This
303 -- has terrible performance, probably due to excessive backtracking
304 --l_pre_pattern := '([^[:space:]]+[[:space:]]*){1,8}$'; -- greedy
305 l_pre_pattern := '[[:space:]]+'; -- just match spaces
306
307 -- Refine the pre segment
308 -- Loop find all the occurences spaces in the pre-segment
309 -- Trim the pre-segment to include the last N words.
310 l_len := 1;
311 i := 0;
312 WHILE(l_len > 0) LOOP
313 l_len := regexp_instr( l_pre_segment,
314 l_pre_pattern,
315 l_len, -- start search at position
316 1, -- return the first occurence
317 1 -- return the position of last character + 1
318 );
319 IF (l_len > 0) THEN
320 l_space_positions(i) := l_len;
321 i := i + 1;
322 END IF;
323
324 END LOOP;
325
326 IF( i > 0 ) THEN
327 i := i - 8; -- get up to eight words preceding the main snippet
328 IF( i < 0 ) THEN i := 0; END IF;
329 l_pre_segment := SUBSTR (l_pre_segment, l_space_positions(i));
330 END IF;
331
332 /*l_len := regexp_instr( l_pre_segment,
333 l_pre_pattern,
334 1, -- start search at position 1
335 1, -- return the first occurence
336 0 -- return the position of first char in match
337 );
338 IF(l_len > 0) THEN
339 l_pre_segment := SUBSTR ( l_pre_segment, l_len );
340 ELSE
341 l_pre_segment := '';
342 END IF;
343 */
344
345 -- dbms_output.put_line('NEW PRE: [' || l_pre_segment || ']');
346 -- dbms_output.put_line('MID: [' || l_best_segment || ']');
347 -- dbms_output.put_line('POST: [' || l_post_segment || ']');
348
349 l_post_pattern := '([[:space:]]*[^[:space:]]+){1,8}'; -- greedy
350 -- Refine the post segment
351 l_len := regexp_instr( l_post_segment,
352 l_post_pattern,
353 1, -- start search at position 1
354 1, -- return the first occurence
355 1 -- return the position of last character + 1
356 );
357 IF(l_len > 0) THEN
358 l_post_segment := SUBSTR ( l_post_segment, 1, l_len - 1 );
359 ELSE
360 l_post_segment := '';
361 END IF;
362
363
364 -- dbms_output.put_line('NEW POST: [' || l_post_segment || ']');
365 -- combine the pre, mid, and post to form the KWIC segment
366 l_kwic_segment := l_pre_segment || l_best_segment || l_post_segment;
367
368 -- surround keywords with the start and end tag, case insensitive
369 l_word := l_distinct_words_master.FIRST;
370 while(l_word IS NOT NULL) LOOP
371 --dbms_output.put_line(l_word || ' ==> ' || regexp_escape(l_word));
375 p_starttag || '\1' || p_endtag,
372 --cannot use regular replace(): need case insensitive match/replace
373 l_kwic_segment := regexp_replace(l_kwic_segment,
374 '(' || regexp_escape(l_word) || ')',
376 1, --start index
377 0, --occurences (0 means ALL)
378 'i' --case insensitive
379 );
380 l_word := l_distinct_words_master.NEXT(l_word);
381 END LOOP;
382
383 return l_kwic_segment;
384 END get_segment_kwic;
385
386 -- This function synthesizes a snippet from the SR notes of the given p_sr_id
387 -- and highlights it with the keywords (p_text_query)
388 -- using the given tags (p_starttag and p_endtag);
389 -- Note: this function does not include private notes of the SR
390 --
391 -- Parameters:
392 -- p_sr_id NUMBER, service request id
393 -- p_text_query VARCHAR2, keywords used for highlighting
394 -- p_starttag VARCHAR2, start highlighting tag
395 -- p_endtag VARCHAR2, end highlighting tag
396 -- Returns:
397 -- Snippet with the keywords wrapped in tags
398 -- Since R12
399 FUNCTION get_sr_snippet
400 (
401 p_sr_id IN NUMBER,
402 p_text_query IN VARCHAR2,
403 p_starttag IN VARCHAR2,
404 p_endtag IN VARCHAR2
405 ) RETURN VARCHAR2
406 IS
407 l_document CLOB; -- the document from which the snippet will be calculated
408 l_data VARCHAR2(32767);
409 l_len NUMBER;
410
411 CURSOR get_sr_notes(c_obj_id NUMBER) IS
412 SELECT notes, notes_detail
413 FROM jtf_notes_vl
414 WHERE source_object_code = 'SR'
415 AND source_object_id = c_obj_id
416 AND note_status <> 'P'; -- IN ('E', 'I'); ignore private notes
417
418 BEGIN
419 --Synthesize the document out of SR notes
420 dbms_lob.createtemporary(l_document, TRUE, dbms_lob.call);
421 FOR element IN get_sr_notes( p_sr_id ) LOOP
422 --append the note (VARCHAR)
423 l_data := element.notes;
424 IF(l_data IS NOT NULL AND
425 length (l_data) > 0) then
426 dbms_lob.writeappend(l_document, length(l_data)+3, '...' || l_data);
427 END IF;
428
429 --append the note detail (CLOB)
430 IF(element.notes_detail IS NOT NULL AND
431 dbms_lob.getlength(element.notes_detail) > 0)
432 THEN
433 dbms_lob.writeappend(l_document, 3, '...');
434 dbms_lob.append(l_document, element.notes_detail);
435 END IF;
436
437 END LOOP;
438
439 -- dbms_output.put_line('DOC_LENGTH = ' || dbms_lob.getlength(l_document));
440 -- dbms_output.put_line(substr(l_document,1,255));
441
442 return get_segment_kwic(p_text_query, p_starttag, p_endtag, l_document);
443
444 END get_sr_snippet;
445
446 --
447 -- This function synthesizes the solution statements of the given p_set_id
448 -- and highlights it with the keywords (p_text_query)
449 -- using the given tags (p_starttag and p_endtag);
450 -- Statements used in this function should respect solution security.
451 --
452 -- Parameters:
453 -- p_set_id NUMBER, set id
454 -- p_text_query VARCHAR2, keywords used for highlighting
455 -- p_starttag VARCHAR2, start highlighting tag
456 -- p_endtag VARCHAR2, end highlighting tag
457 -- Returns:
458 -- Snippet with the keywords wrapped in tags
459 -- Since R12
460 --
461 FUNCTION get_set_snippet
462 (
463 p_set_id IN NUMBER,
464 p_text_query IN VARCHAR2,
465 p_starttag IN VARCHAR2,
466 p_endtag IN VARCHAR2
467 ) RETURN VARCHAR2
468 IS
469 l_document CLOB; -- the document from which the snippet will be calculated
470 l_data VARCHAR2(32767);
471 l_len NUMBER;
472
473 CURSOR get_element_content(p_set_id NUMBER) IS
474 SELECT e.name, e.description
475 FROM cs_kb_set_eles b, cs_kb_elements_vl e
476 WHERE e.element_id = b.element_id
477 AND b.set_id = p_set_id
478 AND e.status = 'PUBLISHED'
479 AND e.access_level >= Cs_Kb_Security_Pvt.GET_STMT_VISIBILITY_POSITION;
480
481 BEGIN
482 --Synthesize the document out of solution statements
483 dbms_lob.createtemporary(l_document, TRUE, dbms_lob.call);
484
485 FOR element IN get_element_content( p_set_id ) LOOP
486 --append the statement
487 l_data := element.name;
488 IF(l_data IS NOT NULL AND
489 length (l_data) > 0) then
490 dbms_lob.writeappend(l_document, length(l_data)+3, '...' || l_data);
491 END IF;
492
493 --append the description (CLOB)
494 IF(element.description IS NOT NULL AND
495 dbms_lob.getlength(element.description) > 0)
496 THEN
497 dbms_lob.writeappend(l_document, 3, '...');
498 dbms_lob.append(l_document, element.description);
499 END IF;
500
501 END LOOP;
502
503 -- dbms_output.put_line('DOC_LENGTH = ' || dbms_lob.getlength(l_document));
504 -- dbms_output.put_line(substr(l_document,1,255));
505
506 return get_segment_kwic(p_text_query, p_starttag, p_endtag, l_document);
507
508 END get_set_snippet;
509
510
511 end CS_KB_KWIC_UTIL ;