4 /*
1 PACKAGE BODY CS_KB_KWIC_UTIL AS
2 /* $Header: cskbkwicb.pls 120.5.12020000.2 2012/07/05 05:37:08 gasankar ship $ */
3
5 *
6 * +======================================================================+
7 * | Copyright (c) 2005 Oracle Corporation |
8 * | Redwood Shores, California, USA |
9 * | All rights reserved. |
10 * +======================================================================+
11 *
12 * FILENAME
13 * cskbkwicb.pls
14 * PURPOSE
15 * Creates the package body for CS_KB_KWIC_UTIL
16 * CS_KB_KWIC_UTIL supports the Keywords In Context implementation
17 *
18 * HISTORY
19 * 12-APR-2005 HMEI Created.
20 * 11-JUL-2005 HMEI Changed get_segment_kwic to use different regular
21 * expressions for pre/post segement. Removed
22 * string reverse logic
23 * 12-JUL-2005 HMEI Changed get_segment_kwic to use custom logic for
24 * determining pre-segment. Cannot use a single
25 * pattern ending in '$', since performance suffers.
26 * 18-JUN-2006 KLOU Fix bug 5217204
27 * Change size of l_kwic_segment to 32000 as the regexp_replace
28 * can return up to 32K of content.
29 * 06-May-2011 isugavan Bug 11837564 - WEIRD CHARACTERS SHOWS UP IN THE
30 * SEARCH RESULT WHEN WE SEARCH KNOWLEDGE BASE DOC
31 */
32
33
34 -- Maximum keyword segment length
35 MAX_SEGMENT_LENGTH CONSTANT NUMBER := 200;
36 -- Left and Right padding used when calculating KWIC
37 SEGMENT_PADDING CONSTANT NUMBER := MAX_SEGMENT_LENGTH/2;
38
39 --
40 -- This API highlights all keywords (p_text_query) in a given document
41 -- (p_document) using the specified start and end tags. This API is intended
42 -- for highlighting a short document, e.g. solution title or SR summary.
43 --
44 -- Parameters:
45 -- p_text_query VARCHAR2, keywords used for highlighting
46 -- p_document VARCHAR2, document to be highlighted
47 -- p_starttag VARCHAR2, start highlighting tag
48 -- p_endtag VARCHAR2, end highlighting tag
49 -- Returns:
50 -- The highlighted document
51 -- Since 12.0
52 --
53 FUNCTION highlight_text
54 (
55 p_text_query IN VARCHAR2,
56 p_document IN VARCHAR2,
57 p_starttag IN VARCHAR2,
58 p_endtag IN VARCHAR2
59 ) RETURN VARCHAR2
60 IS
61 l_result_segment CLOB;
62 BEGIN
63 CTX_DOC.POLICY_MARKUP (
64 policy_name => 'CS_KB_KWIC_POLICY',
65 document => p_document,
66 text_query => p_text_query,
67 starttag => p_starttag,
68 endtag => p_endtag,
69 restab => l_result_segment);
70 return l_result_segment;
71 END highlight_text;
72
73 --
74 -- This Utility method escapes regular expression operators in a string.
75 -- This is useful if you want to literally match a string that may contain
76 -- a regular expression operator. (e.g. 'e*trade')
77 --
78 -- Parameters:
79 -- p_string VARCHAR2, the string to escape
80 -- Returns:
81 -- string with regexp special characters escaped.
82 -- Since 12.0
83 --
84 FUNCTION regexp_escape
85 (
86 p_string IN VARCHAR2
87 ) RETURN VARCHAR2
88 IS
89 l_string VARCHAR2(32767); -- the final string after escaping
90 l_special_chars VARCHAR2(100); --chars to escape
91 l_char VARCHAR2(8);
92 BEGIN
93 l_string := p_string;
94 l_special_chars := '\[](){}*+$^?|.'; -- '\' must come first
95
96 FOR i in 1..length(l_special_chars) LOOP
97 l_char := substr(l_special_chars, i, 1);
98 l_string := replace(l_string, l_char, '\' || l_char);
99 END LOOP;
100
101 return l_string;
102 END regexp_escape;
103
104 --
105 -- This API highlights all keywords (p_text_query) in a given document
106 -- (p_document) using the specified start and end tags. It scans the
107 -- document to construct a highlighted segment that contains the most
108 -- number of distinct keywords, with some surrounding context.
109 -- If no keywords found, will return NULL
110 --
111 -- Parameters:
112 -- p_text_query VARCHAR2, keywords used for highlighting
113 -- p_starttag VARCHAR2, start highlighting tag
114 -- p_endtag VARCHAR2, end highlighting tag
115 -- p_document VARCHAR2, document to be highlighted
116 -- Returns:
120 --
117 -- The highlighted KWIC (Key Words In Context) segment,
118 -- or NULL if no keywords found.
119 -- Since 12.0
121 FUNCTION get_segment_kwic
122 (
123 p_text_query IN VARCHAR2,
124 p_starttag IN VARCHAR2,
125 p_endtag IN VARCHAR2,
126 p_document IN CLOB
127 ) RETURN VARCHAR2
128 IS
129 l_restab CTX_DOC.highlight_tab; -- Keyword match info returned by CTX_DOC
130
131 TYPE number_table IS TABLE OF INTEGER INDEX BY PLS_INTEGER;
132 -- word occupies the range [open, close)
133 l_open_positions number_table; -- open positions of matching words
134 l_close_positions number_table; -- close positions of matching words
135
136 i INTEGER;
137 idx INTEGER;
138 openIdx INTEGER; -- index for looping through the open positions
139 closeIdx INTEGER; -- index for looping through the close positions
140
141 -- Map to hold distinct keywords
142 TYPE distinct_word_map IS TABLE OF INTEGER INDEX BY VARCHAR2(32767);
143 l_distinct_words_master distinct_word_map; -- master list of distinct words
144 l_distinct_words distinct_word_map; -- distinct words for a given segment
145
146 l_word VARCHAR2(32767);
147 l_len INTEGER;
148
149 --used when calculating the best segment
150 l_word_start INTEGER;
151 l_word_end INTEGER;
152 l_segment_start INTEGER;
153 l_segment_end INTEGER;
154
155 --represent the best segment
156 l_best_count INTEGER := 0; --best # of distinct keywords in any segment
157 l_best_segment_start INTEGER := null;
158 l_best_segment_end INTEGER := null;
159 l_best_segment VARCHAR2(1000);
160
161 --pre and post segments (the "Context" part of KWIC)
162 l_pre_segment VARCHAR2(1000);
163 l_post_segment VARCHAR2(1000);
164 l_pre_segment_start INTEGER;
165 l_post_segment_end INTEGER;
166
167 l_space_positions number_table; -- used to calculate pre-segement
168 l_pre_pattern VARCHAR2(50); -- regular expression
169 l_post_pattern VARCHAR2(50); -- regular expression
170
171 -- (5217204)
172 l_kwic_segment VARCHAR2(32000); -- the final kwic segment
173
174 -- Bug fix 11837564
175 l_char varchar2(20);
176 l_left Number := 0;
177 l_right Number := 0;
178 l_length Number;
179
180 BEGIN
181
182 -- positions of matching words returned in l_restab
183 CTX_DOC.POLICY_HIGHLIGHT (
184 policy_name => 'CS_KB_KWIC_POLICY',
185 document => p_document,
186 text_query => p_text_query,
187 restab => l_restab );
188
189 -- form master list of distinct keywords found in the document
190 i := l_restab.FIRST;
191 WHILE (i IS NOT NULL) LOOP
192 l_open_positions(i) := l_restab(i).offset;
193 l_close_positions(i) := l_restab(i).offset + l_restab(i).length;
194
195 dbms_lob.read(p_document,
196 l_restab(i).length,
197 l_restab(i).offset,
198 l_word );
199
200 l_distinct_words_master( LOWER(l_word) ) := 1; --keys are the keywords
201
202 i := l_restab.NEXT(i);
203 END LOOP;
204
205 --dbms_output.put_line('# Distinct keywords:' || l_distinct_words_master.COUNT);
206
207 -- Find the best segment (that with the most distinct keywords)
208 -- Loop through open and close positions and form the largest segment
209 -- possible. For each segment, keep track of distinct keywords. Note
210 -- each segment begins and ends with a matching keyword
211 openIdx := l_open_positions.FIRST;
212 WHILE (openIdx IS NOT NULL AND
213 l_best_count < l_distinct_words_master.COUNT
214 ) LOOP
215 --segment start and end positions for this iteration
216 l_segment_start := l_open_positions(openIdx);
217 l_segment_end := null;
218
219 l_distinct_words.DELETE; -- clear out keyword map.
220
221 closeIdx := openIdx; -- slide the closeIdx to determine the segment close.
222 WHILE (closeIdx IS NOT NULL AND
223 l_close_positions(closeIdx) - l_segment_start <= MAX_SEGMENT_LENGTH
224 ) LOOP
225 l_segment_end := l_close_positions(closeIdx); --current end of segment
226
227 -- read the current keyword
228 l_word_start := l_open_positions(closeIdx);
229 l_word_end := l_close_positions(closeIdx);
230 l_len := l_word_end - l_word_start;
231 dbms_lob.read(p_document,
232 l_len,
233 l_word_start,
234 l_word );
235 --dbms_output.put_line(l_word_start || ' ' || l_word_end || ' [' || l_word || ']');
236 l_distinct_words( LOWER(l_word) ) := 1; -- add keyword to map
237
238 closeIdx := l_close_positions.NEXT(closeIdx);
239 END LOOP;
240
241 -- Note: A Corner Case:
242 -- l_segment_end is null if the keyword length > MAX_SEGMENT_LENGTH
243
244 if(l_distinct_words.COUNT > l_best_count AND
245 l_segment_end IS NOT NULL)
246 then
247 l_best_count := l_distinct_words.COUNT;
248 l_best_segment_start := l_segment_start;
249 l_best_segment_end := l_segment_end;
250 end if;
251 --dbms_output.put_line( '[' || l_segment_start || ',' || l_segment_end || '] = ' || l_distinct_words.COUNT);
252
253 if(closeIdx IS NOT NULL AND
254 l_segment_end IS NOT NULL AND
255 l_close_positions(closeIdx) - l_segment_end >= MAX_SEGMENT_LENGTH)
256 then
257 openIdx := l_open_positions.NEXT(closeIdx); --look-ahead optimization
258 --dbms_output.put_line( 'Looking AHEAD');
262
259 else
260 openIdx := l_open_positions.NEXT(openIdx);
261 end if;
263 END LOOP;
264
265 -- could not find a best segment
266 if(l_best_segment_start IS NULL) then return NULL; end if;
267
268 -- now calculate the context surrounding the snippet
269 -- find pre segment start position
270 l_pre_segment_start := l_best_segment_start - SEGMENT_PADDING;
271 if(l_pre_segment_start < 1) then
272 l_pre_segment_start := 1;
273 end if;
274 -- find the post segment end position
275 l_post_segment_end := l_best_segment_end + SEGMENT_PADDING;
276 l_len := dbms_lob.getlength(p_document);
277 if(l_post_segment_end > l_len) then
278 l_post_segment_end := l_len + 1; --last position + 1
279 end if;
280
281 -- get the middle segment (the one with the densest keywords)
282 l_len := l_best_segment_end - l_best_segment_start;
283 dbms_lob.read(p_document,
284 l_len,
285 l_best_segment_start,
286 l_best_segment );
287
288 -- now get the prefix segment
289 l_len := l_best_segment_start - l_pre_segment_start;
290 if(l_len > 0) then
291 dbms_lob.read(p_document,
292 l_len,
293 l_pre_segment_start,
294 l_pre_segment );
295 end if;
296
297 -- now get the postfix segment
298 l_len := l_post_segment_end - l_best_segment_end;
299 if(l_len > 0) then
300 dbms_lob.read(p_document,
301 l_len,
302 l_best_segment_end,
303 l_post_segment );
304 end if;
305
306 -- dbms_output.put_line('PRE: [' || l_pre_segment || ']');
307
308 -- Now refine the pre and post segments
309 -- try to match up to 8 words on either side (pre and post)
310
311 --NOTE: cannot use a single expression ending in '$'. This
312 -- has terrible performance, probably due to excessive backtracking
313 --l_pre_pattern := '([^[:space:]]+[[:space:]]*){1,8}$'; -- greedy
314 l_pre_pattern := '[[:space:]]+'; -- just match spaces
315
316 -- Refine the pre segment
317 -- Loop find all the occurences spaces in the pre-segment
318 -- Trim the pre-segment to include the last N words.
319 l_len := 1;
320 i := 0;
321 WHILE(l_len > 0) LOOP
322 l_len := regexp_instr( l_pre_segment,
323 l_pre_pattern,
324 l_len, -- start search at position
325 1, -- return the first occurence
326 1 -- return the position of last character + 1
327 );
328 IF (l_len > 0) THEN
329 l_space_positions(i) := l_len;
330 i := i + 1;
331 END IF;
332
333 END LOOP;
334
335 IF( i > 0 ) THEN
336 i := i - 8; -- get up to eight words preceding the main snippet
337 IF( i < 0 ) THEN i := 0; END IF;
338 l_pre_segment := SUBSTR (l_pre_segment, l_space_positions(i));
339 END IF;
340 -- Bug fix 11837564 Start
341 if(l_pre_segment is not null) then
342 l_length := length(l_pre_segment);
343 For i in 1.. l_length loop
344 l_char := substr(l_pre_segment,i,1);
345 If l_char = '<' then
346 l_left := l_left + 1;
347 End if;
348 If l_char = '>' then
349 l_right := l_right + 1;
350 End if;
351 end loop;
352 If l_right > l_left then
353 l_pre_segment := '<' || l_pre_segment;
354 End if;
355 End if;
356 -- Bug fix 11837564 End
357 /*l_len := regexp_instr( l_pre_segment,
358 l_pre_pattern,
359 1, -- start search at position 1
360 1, -- return the first occurence
361 0 -- return the position of first char in match
362 );
363 IF(l_len > 0) THEN
364 l_pre_segment := SUBSTR ( l_pre_segment, l_len );
365 ELSE
366 l_pre_segment := '';
367 END IF;
368 */
369
370 -- dbms_output.put_line('NEW PRE: [' || l_pre_segment || ']');
371 -- dbms_output.put_line('MID: [' || l_best_segment || ']');
372 -- dbms_output.put_line('POST: [' || l_post_segment || ']');
373
374 l_post_pattern := '([[:space:]]*[^[:space:]]+){1,8}'; -- greedy
375 -- Refine the post segment
376 l_len := regexp_instr( l_post_segment,
377 l_post_pattern,
378 1, -- start search at position 1
379 1, -- return the first occurence
380 1 -- return the position of last character + 1
381 );
382 IF(l_len > 0) THEN
383 l_post_segment := SUBSTR ( l_post_segment, 1, l_len - 1 );
384 ELSE
385 l_post_segment := '';
386 END IF;
387
388
389 -- dbms_output.put_line('NEW POST: [' || l_post_segment || ']');
390 -- combine the pre, mid, and post to form the KWIC segment
391 l_kwic_segment := l_pre_segment || l_best_segment || l_post_segment;
392
393 -- surround keywords with the start and end tag, case insensitive
394 l_word := l_distinct_words_master.FIRST;
395 while(l_word IS NOT NULL) LOOP
396 --dbms_output.put_line(l_word || ' ==> ' || regexp_escape(l_word));
397 --cannot use regular replace(): need case insensitive match/replace
398 l_kwic_segment := regexp_replace(l_kwic_segment,
402 0, --occurences (0 means ALL)
399 '(' || regexp_escape(l_word) || ')',
400 p_starttag || '\1' || p_endtag,
401 1, --start index
403 'i' --case insensitive
404 );
405 l_word := l_distinct_words_master.NEXT(l_word);
406 END LOOP;
407
408 return l_kwic_segment;
409 END get_segment_kwic;
410
411 -- This function synthesizes a snippet from the SR notes of the given p_sr_id
412 -- and highlights it with the keywords (p_text_query)
413 -- using the given tags (p_starttag and p_endtag);
414 -- Note: this function does not include private notes of the SR
415 --
416 -- Parameters:
417 -- p_sr_id NUMBER, service request id
418 -- p_text_query VARCHAR2, keywords used for highlighting
419 -- p_starttag VARCHAR2, start highlighting tag
420 -- p_endtag VARCHAR2, end highlighting tag
421 -- Returns:
422 -- Snippet with the keywords wrapped in tags
423 -- Since R12
424 FUNCTION get_sr_snippet
425 (
426 p_sr_id IN NUMBER,
427 p_text_query IN VARCHAR2,
428 p_starttag IN VARCHAR2,
429 p_endtag IN VARCHAR2
430 ) RETURN VARCHAR2
431 IS
432 l_document CLOB; -- the document from which the snippet will be calculated
433 l_data VARCHAR2(32767);
434 l_len NUMBER;
435
436 CURSOR get_sr_notes(c_obj_id NUMBER) IS
437 SELECT notes, notes_detail
438 FROM jtf_notes_vl
439 WHERE source_object_code = 'SR'
440 AND source_object_id = c_obj_id
441 AND note_status <> 'P'; -- IN ('E', 'I'); ignore private notes
442
443 BEGIN
444 --Synthesize the document out of SR notes
445 dbms_lob.createtemporary(l_document, TRUE, dbms_lob.call);
446 FOR element IN get_sr_notes( p_sr_id ) LOOP
447 --append the note (VARCHAR)
448 l_data := element.notes;
449 IF(l_data IS NOT NULL AND
450 length (l_data) > 0) then
451 dbms_lob.writeappend(l_document, length(l_data)+3, '...' || l_data);
452 END IF;
453
454 --append the note detail (CLOB)
455 IF(element.notes_detail IS NOT NULL AND
456 dbms_lob.getlength(element.notes_detail) > 0)
457 THEN
458 dbms_lob.writeappend(l_document, 3, '...');
459 dbms_lob.append(l_document, element.notes_detail);
460 END IF;
461
462 END LOOP;
463
464 -- dbms_output.put_line('DOC_LENGTH = ' || dbms_lob.getlength(l_document));
465 -- dbms_output.put_line(substr(l_document,1,255));
466
467 return get_segment_kwic(p_text_query, p_starttag, p_endtag, l_document);
468
469 END get_sr_snippet;
470
471 --
472 -- This function synthesizes the solution statements of the given p_set_id
473 -- and highlights it with the keywords (p_text_query)
474 -- using the given tags (p_starttag and p_endtag);
475 -- Statements used in this function should respect solution security.
476 --
477 -- Parameters:
478 -- p_set_id NUMBER, set id
479 -- p_text_query VARCHAR2, keywords used for highlighting
480 -- p_starttag VARCHAR2, start highlighting tag
481 -- p_endtag VARCHAR2, end highlighting tag
482 -- Returns:
483 -- Snippet with the keywords wrapped in tags
484 -- Since R12
485 --
486 FUNCTION get_set_snippet
487 (
488 p_set_id IN NUMBER,
489 p_text_query IN VARCHAR2,
490 p_starttag IN VARCHAR2,
491 p_endtag IN VARCHAR2
492 ) RETURN VARCHAR2
493 IS
494 l_document CLOB; -- the document from which the snippet will be calculated
495 l_data VARCHAR2(32767);
496 l_len NUMBER;
497
498 CURSOR get_element_content(p_set_id NUMBER) IS
499 SELECT e.name, e.description
500 FROM cs_kb_set_eles b, cs_kb_elements_vl e
501 WHERE e.element_id = b.element_id
502 AND b.set_id = p_set_id
503 AND e.status = 'PUBLISHED'
504 AND e.access_level >= Cs_Kb_Security_Pvt.GET_STMT_VISIBILITY_POSITION;
505
506 BEGIN
507 --Synthesize the document out of solution statements
508 dbms_lob.createtemporary(l_document, TRUE, dbms_lob.call);
509
510 FOR element IN get_element_content( p_set_id ) LOOP
511 --append the statement
512 l_data := element.name;
513 IF(l_data IS NOT NULL AND
514 length (l_data) > 0) then
515 dbms_lob.writeappend(l_document, length(l_data)+3, '...' || l_data);
516 END IF;
517
518 --append the description (CLOB)
519 IF(element.description IS NOT NULL AND
520 dbms_lob.getlength(element.description) > 0)
521 THEN
522 dbms_lob.writeappend(l_document, 3, '...');
523 dbms_lob.append(l_document, element.description);
524 END IF;
525
526 END LOOP;
527
528 -- dbms_output.put_line('DOC_LENGTH = ' || dbms_lob.getlength(l_document));
529 -- dbms_output.put_line(substr(l_document,1,255));
530
531 return get_segment_kwic(p_text_query, p_starttag, p_endtag, l_document);
532
533 END get_set_snippet;
534
535
536 end CS_KB_KWIC_UTIL ;