DBA Data[Home] [Help]

PACKAGE BODY: APPS.CS_KB_KWIC_UTIL

Source


1 PACKAGE BODY CS_KB_KWIC_UTIL AS
2 /* $Header: cskbkwicb.pls 120.3 2006/05/18 16:22:00 klou noship $ */
3 
4 /*
5  *
6  * +======================================================================+
7  * |                Copyright (c) 2005 Oracle Corporation                 |
8  * |                   Redwood Shores, California, USA                    |
9  * |                        All rights reserved.                          |
10  * +======================================================================+
11  *
12  *   FILENAME
13  *     cskbkwicb.pls
14  *   PURPOSE
15  *     Creates the package body for CS_KB_KWIC_UTIL
16  *     CS_KB_KWIC_UTIL supports the Keywords In Context implementation
17  *
18  *   HISTORY
19  *   12-APR-2005 HMEI Created.
20  *   11-JUL-2005 HMEI Changed get_segment_kwic to use different regular
21  *                    expressions for pre/post segement. Removed
22  *                    string reverse logic
23  *   12-JUL-2005 HMEI Changed get_segment_kwic to use custom logic for
24  *                    determining pre-segment.  Cannot use a single
25  *                    pattern ending in '$', since performance suffers.
26  *   18-JUN-2006 KLOU Fix bug 5217204
27  *                    Change size of l_kwic_segment to 32000 as the regexp_replace
28  *                    can return up to 32K of content.
29  */
30 
31 
32   -- Maximum keyword segment length
33   MAX_SEGMENT_LENGTH CONSTANT NUMBER := 200;
34   -- Left and Right padding used when calculating KWIC
35   SEGMENT_PADDING CONSTANT NUMBER := MAX_SEGMENT_LENGTH/2;
36 
37   --
38   -- This API highlights all keywords (p_text_query) in a given document
39   -- (p_document) using the specified start and end tags. This API is intended
40   -- for highlighting a short document, e.g. solution title or SR summary.
41   --
42   -- Parameters:
43   --   p_text_query VARCHAR2, keywords used for highlighting
44   --   p_document   VARCHAR2, document to be highlighted
45   --   p_starttag   VARCHAR2, start highlighting tag
46   --   p_endtag     VARCHAR2, end highlighting tag
47   -- Returns:
48   --   The highlighted document
49   -- Since 12.0
50   --
51   FUNCTION highlight_text
52   (
53   p_text_query  IN VARCHAR2,
54   p_document    IN VARCHAR2,
55   p_starttag    IN VARCHAR2,
56   p_endtag      IN VARCHAR2
57   ) RETURN VARCHAR2
58   IS
59     l_result_segment CLOB;
60   BEGIN
61     CTX_DOC.POLICY_MARKUP (
62       policy_name => 'CS_KB_KWIC_POLICY',
63       document => p_document,
64       text_query => p_text_query,
65       starttag => p_starttag,
66       endtag => p_endtag,
67       restab => l_result_segment);
68     return l_result_segment;
69   END highlight_text;
70 
71   --
72   -- This Utility method escapes regular expression operators in a string.
73   -- This is useful if you want to literally match a string that may contain
74   -- a regular expression operator.  (e.g. 'e*trade')
75   --
76   -- Parameters:
77   --   p_string VARCHAR2, the string to escape
78   -- Returns:
79   --   string with regexp special characters escaped.
80   -- Since 12.0
81   --
82   FUNCTION regexp_escape
83   (
84   p_string IN VARCHAR2
85   ) RETURN VARCHAR2
86   IS
87     l_string VARCHAR2(32767); -- the final string after escaping
88     l_special_chars VARCHAR2(100); --chars to escape
89     l_char VARCHAR2(8);
90   BEGIN
91     l_string := p_string;
92     l_special_chars := '\[](){}*+$^?|.'; -- '\' must come first
93 
94     FOR i in 1..length(l_special_chars) LOOP
95       l_char := substr(l_special_chars, i, 1);
96       l_string := replace(l_string, l_char, '\' || l_char);
97     END LOOP;
98 
99     return l_string;
100   END regexp_escape;
101 
102   --
103   -- This API highlights all keywords (p_text_query) in a given document
104   -- (p_document) using the specified start and end tags. It scans the
105   -- document to construct a highlighted segment that contains the most
106   -- number of distinct keywords, with some surrounding context.
107   -- If no keywords found, will return NULL
108   --
109   -- Parameters:
110   --   p_text_query VARCHAR2, keywords used for highlighting
111   --   p_starttag   VARCHAR2, start highlighting tag
112   --   p_endtag     VARCHAR2, end highlighting tag
113   --   p_document   VARCHAR2, document to be highlighted
114   -- Returns:
115   --   The highlighted KWIC (Key Words In Context) segment,
116   --   or NULL if no keywords found.
117   -- Since 12.0
118   --
119   FUNCTION get_segment_kwic
120   (
121   p_text_query  IN VARCHAR2,
122   p_starttag    IN VARCHAR2,
123   p_endtag      IN VARCHAR2,
124   p_document    IN CLOB
125   ) RETURN VARCHAR2
126   IS
127     l_restab CTX_DOC.highlight_tab; -- Keyword match info returned by CTX_DOC
128 
129     TYPE number_table IS TABLE OF INTEGER INDEX BY PLS_INTEGER;
130     -- word occupies the range [open, close)
131     l_open_positions number_table;   -- open positions of matching words
132     l_close_positions number_table;  -- close positions of matching words
133 
134     i INTEGER;
135     idx INTEGER;
136     openIdx INTEGER; -- index for looping through the open positions
137     closeIdx INTEGER; -- index for looping through the close positions
138 
139     -- Map to hold distinct keywords
140     TYPE distinct_word_map IS TABLE OF INTEGER INDEX BY VARCHAR2(32767);
141     l_distinct_words_master distinct_word_map; -- master list of distinct words
142     l_distinct_words distinct_word_map; -- distinct words for a given segment
143 
144     l_word VARCHAR2(32767);
145     l_len INTEGER;
146 
147     --used when calculating the best segment
148     l_word_start INTEGER;
149     l_word_end INTEGER;
150     l_segment_start INTEGER;
151     l_segment_end INTEGER;
152 
153     --represent the best segment
154     l_best_count INTEGER := 0; --best # of distinct keywords in any segment
155     l_best_segment_start INTEGER := null;
156     l_best_segment_end INTEGER := null;
157     l_best_segment VARCHAR2(1000);
158 
159     --pre and post segments (the "Context" part of KWIC)
160     l_pre_segment VARCHAR2(1000);
161     l_post_segment VARCHAR2(1000);
162     l_pre_segment_start INTEGER;
163     l_post_segment_end INTEGER;
164 
165     l_space_positions number_table;  -- used to calculate pre-segement
166     l_pre_pattern VARCHAR2(50); -- regular expression
167     l_post_pattern VARCHAR2(50); -- regular expression
168 
169     -- (5217204)
170     l_kwic_segment VARCHAR2(32000); -- the final kwic segment
171   BEGIN
172 
173     -- positions of matching words returned in l_restab
174     CTX_DOC.POLICY_HIGHLIGHT (
175       policy_name => 'CS_KB_KWIC_POLICY',
176       document => p_document,
177       text_query => p_text_query,
178       restab => l_restab );
179 
180     -- form master list of distinct keywords found in the document
181     i := l_restab.FIRST;
182     WHILE (i IS NOT NULL) LOOP
183       l_open_positions(i) := l_restab(i).offset;
184       l_close_positions(i) := l_restab(i).offset + l_restab(i).length;
185 
186       dbms_lob.read(p_document,
187                     l_restab(i).length,
188                     l_restab(i).offset,
189                     l_word );
190 
191       l_distinct_words_master( LOWER(l_word) ) := 1; --keys are the keywords
192 
193       i := l_restab.NEXT(i);
194     END LOOP;
195 
196     --dbms_output.put_line('# Distinct keywords:' || l_distinct_words_master.COUNT);
197 
198     -- Find the best segment (that with the most distinct keywords)
199     -- Loop through open and close positions and form the largest segment
200     -- possible. For each segment, keep track of distinct keywords.  Note
201     -- each segment begins and ends with a matching keyword
202     openIdx := l_open_positions.FIRST;
203     WHILE (openIdx IS NOT NULL AND
204            l_best_count < l_distinct_words_master.COUNT
205           ) LOOP
206       --segment start and end positions for this iteration
207       l_segment_start := l_open_positions(openIdx);
208       l_segment_end := null;
209 
210       l_distinct_words.DELETE; -- clear out keyword map.
211 
212       closeIdx := openIdx; -- slide the closeIdx to determine the segment close.
213       WHILE (closeIdx IS NOT NULL AND
214              l_close_positions(closeIdx) - l_segment_start <= MAX_SEGMENT_LENGTH
215              ) LOOP
216         l_segment_end := l_close_positions(closeIdx); --current end of segment
217 
218         -- read the current keyword
219         l_word_start := l_open_positions(closeIdx);
220         l_word_end := l_close_positions(closeIdx);
221         l_len := l_word_end - l_word_start;
222         dbms_lob.read(p_document,
223                       l_len,
224                       l_word_start,
225                       l_word );
226         --dbms_output.put_line(l_word_start || ' ' || l_word_end || ' [' || l_word || ']');
227         l_distinct_words( LOWER(l_word) ) := 1; -- add keyword to map
228 
229         closeIdx := l_close_positions.NEXT(closeIdx);
230       END LOOP;
231 
232       -- Note: A Corner Case:
233       -- l_segment_end is null if the keyword length > MAX_SEGMENT_LENGTH
234 
235       if(l_distinct_words.COUNT > l_best_count AND
236          l_segment_end IS NOT NULL)
237       then
238         l_best_count := l_distinct_words.COUNT;
239         l_best_segment_start := l_segment_start;
240         l_best_segment_end := l_segment_end;
241       end if;
242       --dbms_output.put_line( '[' || l_segment_start || ',' || l_segment_end || '] = ' || l_distinct_words.COUNT);
243 
244       if(closeIdx IS NOT NULL AND
245          l_segment_end IS NOT NULL AND
246          l_close_positions(closeIdx) - l_segment_end >= MAX_SEGMENT_LENGTH)
247       then
248         openIdx := l_open_positions.NEXT(closeIdx); --look-ahead optimization
249         --dbms_output.put_line( 'Looking AHEAD');
250       else
251         openIdx := l_open_positions.NEXT(openIdx);
252       end if;
253 
254     END LOOP;
255 
256     -- could not find a best segment
257     if(l_best_segment_start IS NULL) then return NULL; end if;
258 
259     -- now calculate the context surrounding the snippet
260     -- find pre segment start position
261     l_pre_segment_start := l_best_segment_start - SEGMENT_PADDING;
262     if(l_pre_segment_start < 1) then
263       l_pre_segment_start := 1;
264     end if;
265     -- find the post segment end position
266     l_post_segment_end := l_best_segment_end + SEGMENT_PADDING;
267     l_len := dbms_lob.getlength(p_document);
268     if(l_post_segment_end > l_len) then
269       l_post_segment_end := l_len + 1; --last position + 1
270     end if;
271 
272     -- get the middle segment (the one with the densest keywords)
273     l_len := l_best_segment_end - l_best_segment_start;
274     dbms_lob.read(p_document,
275                     l_len,
276                     l_best_segment_start,
277                     l_best_segment );
278 
279     -- now get the prefix segment
280     l_len := l_best_segment_start - l_pre_segment_start;
281     if(l_len > 0) then
282       dbms_lob.read(p_document,
283                     l_len,
284                     l_pre_segment_start,
285                     l_pre_segment );
286     end if;
287 
288     -- now get the postfix segment
289     l_len := l_post_segment_end - l_best_segment_end;
290     if(l_len > 0) then
291       dbms_lob.read(p_document,
292                     l_len,
293                     l_best_segment_end,
294                     l_post_segment );
295     end if;
296 
297 --    dbms_output.put_line('PRE: [' || l_pre_segment || ']');
298 
299     -- Now refine the pre and post segments
300     -- try to match up to 8 words on either side (pre and post)
301 
302     --NOTE: cannot use a single expression ending in '$'.  This
303     -- has terrible performance, probably due to excessive backtracking
304     --l_pre_pattern := '([^[:space:]]+[[:space:]]*){1,8}$'; -- greedy
305     l_pre_pattern := '[[:space:]]+'; -- just match spaces
306 
307     -- Refine the pre segment
308     -- Loop find all the occurences spaces in the pre-segment
309     -- Trim the pre-segment to include the last N words.
310     l_len := 1;
311     i := 0;
312     WHILE(l_len > 0) LOOP
313       l_len := regexp_instr( l_pre_segment,
314                              l_pre_pattern,
315                              l_len,  -- start search at position
316                              1,  -- return the first occurence
317                     	     1   -- return the position of last character + 1
318                              );
319       IF (l_len > 0) THEN
320         l_space_positions(i) := l_len;
321         i := i + 1;
322       END IF;
323 
324     END LOOP;
325 
326     IF( i > 0 ) THEN
327       i := i - 8; -- get up to eight words preceding the main snippet
328       IF( i < 0 ) THEN i := 0; END IF;
329       l_pre_segment := SUBSTR (l_pre_segment, l_space_positions(i));
330     END IF;
331 
332     /*l_len := regexp_instr( l_pre_segment,
333                            l_pre_pattern,
334                            1,  -- start search at position 1
335                            1,  -- return the first occurence
336                     	   0   -- return the position of first char in match
337                            );
338     IF(l_len > 0) THEN
339       l_pre_segment := SUBSTR ( l_pre_segment, l_len );
340     ELSE
341       l_pre_segment := '';
342     END IF;
343     */
344 
345 --    dbms_output.put_line('NEW PRE: [' || l_pre_segment || ']');
346 --    dbms_output.put_line('MID: [' || l_best_segment || ']');
347 --    dbms_output.put_line('POST: [' || l_post_segment || ']');
348 
349     l_post_pattern := '([[:space:]]*[^[:space:]]+){1,8}'; -- greedy
350     -- Refine the post segment
351     l_len := regexp_instr( l_post_segment,
352                            l_post_pattern,
353                            1,  -- start search at position 1
354                            1,  -- return the first occurence
355                            1   -- return the position of last character + 1
356                            );
357     IF(l_len > 0) THEN
358       l_post_segment := SUBSTR ( l_post_segment, 1, l_len - 1 );
359     ELSE
360       l_post_segment := '';
361     END IF;
362 
363 
364 --    dbms_output.put_line('NEW POST: [' || l_post_segment || ']');
365     -- combine the pre, mid, and post to form the KWIC segment
366     l_kwic_segment := l_pre_segment || l_best_segment || l_post_segment;
367 
368     -- surround keywords with the start and end tag, case insensitive
369     l_word := l_distinct_words_master.FIRST;
370     while(l_word IS NOT NULL) LOOP
371       --dbms_output.put_line(l_word || ' ==> ' || regexp_escape(l_word));
375                                        p_starttag || '\1' || p_endtag,
372       --cannot use regular replace(): need case insensitive match/replace
373       l_kwic_segment := regexp_replace(l_kwic_segment,
374                                        '(' || regexp_escape(l_word) || ')',
376                                        1,  --start index
377                                        0,  --occurences (0 means ALL)
378                                        'i' --case insensitive
379                                       );
380       l_word := l_distinct_words_master.NEXT(l_word);
381     END LOOP;
382 
383     return l_kwic_segment;
384   END get_segment_kwic;
385 
386   -- This function synthesizes a snippet from the SR notes of the given p_sr_id
387   -- and highlights it with the keywords (p_text_query)
388   -- using the given tags (p_starttag and p_endtag);
389   -- Note: this function does not include private notes of the SR
390   --
391   -- Parameters:
392   --   p_sr_id      NUMBER, service request id
393   --   p_text_query VARCHAR2, keywords used for highlighting
394   --   p_starttag   VARCHAR2, start highlighting tag
395   --   p_endtag     VARCHAR2, end highlighting tag
396   -- Returns:
397   --   Snippet with the keywords wrapped in tags
398   -- Since R12
399   FUNCTION get_sr_snippet
400   (
401   p_sr_id       IN NUMBER,
402   p_text_query  IN VARCHAR2,
403   p_starttag    IN VARCHAR2,
404   p_endtag      IN VARCHAR2
405   ) RETURN VARCHAR2
406   IS
407     l_document CLOB; -- the document from which the snippet will be calculated
408     l_data VARCHAR2(32767);
409     l_len NUMBER;
410 
411     CURSOR get_sr_notes(c_obj_id NUMBER) IS
412      SELECT notes, notes_detail
413       FROM jtf_notes_vl
414       WHERE source_object_code  = 'SR'
415       AND source_object_id = c_obj_id
416       AND note_status <> 'P'; -- IN ('E', 'I'); ignore private notes
417 
418   BEGIN
419     --Synthesize the document out of SR notes
420     dbms_lob.createtemporary(l_document, TRUE, dbms_lob.call);
421     FOR element IN get_sr_notes( p_sr_id ) LOOP
422       --append the note (VARCHAR)
423       l_data := element.notes;
424       IF(l_data IS NOT NULL AND
425         length (l_data) > 0) then
426         dbms_lob.writeappend(l_document, length(l_data)+3, '...' || l_data);
427       END IF;
428 
429       --append the note detail (CLOB)
430       IF(element.notes_detail IS NOT NULL AND
431          dbms_lob.getlength(element.notes_detail) > 0)
432       THEN
433         dbms_lob.writeappend(l_document, 3, '...');
434         dbms_lob.append(l_document, element.notes_detail);
435       END IF;
436 
437     END LOOP;
438 
439 --    dbms_output.put_line('DOC_LENGTH = ' || dbms_lob.getlength(l_document));
440 --    dbms_output.put_line(substr(l_document,1,255));
441 
442     return get_segment_kwic(p_text_query, p_starttag, p_endtag, l_document);
443 
444   END get_sr_snippet;
445 
446   --
447   -- This function synthesizes the solution statements of the given p_set_id
448   -- and highlights it with the keywords (p_text_query)
449   -- using the given tags (p_starttag and p_endtag);
450   -- Statements used in this function should respect solution security.
451   --
452   -- Parameters:
453   --   p_set_id     NUMBER, set id
454   --   p_text_query VARCHAR2, keywords used for highlighting
455   --   p_starttag   VARCHAR2, start highlighting tag
456   --   p_endtag     VARCHAR2, end highlighting tag
457   -- Returns:
458   --   Snippet with the keywords wrapped in tags
459   -- Since R12
460   --
461   FUNCTION get_set_snippet
462   (
463   p_set_id      IN NUMBER,
464   p_text_query  IN VARCHAR2,
465   p_starttag    IN VARCHAR2,
466   p_endtag      IN VARCHAR2
467   ) RETURN VARCHAR2
468   IS
469     l_document CLOB; -- the document from which the snippet will be calculated
470     l_data VARCHAR2(32767);
471     l_len NUMBER;
472 
473     CURSOR get_element_content(p_set_id NUMBER) IS
474       SELECT e.name, e.description
475 	   FROM cs_kb_set_eles b, cs_kb_elements_vl e
476         WHERE e.element_id = b.element_id
477 		 AND b.set_id = p_set_id
478 		 AND e.status = 'PUBLISHED'
479 		 AND e.access_level >= Cs_Kb_Security_Pvt.GET_STMT_VISIBILITY_POSITION;
480 
481   BEGIN
482     --Synthesize the document out of solution statements
483     dbms_lob.createtemporary(l_document, TRUE, dbms_lob.call);
484 
485     FOR element IN get_element_content( p_set_id ) LOOP
486       --append the statement
487       l_data := element.name;
488       IF(l_data IS NOT NULL AND
489         length (l_data) > 0) then
490         dbms_lob.writeappend(l_document, length(l_data)+3, '...' || l_data);
491       END IF;
492 
493       --append the description (CLOB)
494       IF(element.description IS NOT NULL AND
495          dbms_lob.getlength(element.description) > 0)
496       THEN
497         dbms_lob.writeappend(l_document, 3, '...');
498         dbms_lob.append(l_document, element.description);
499       END IF;
500 
501     END LOOP;
502 
503 --    dbms_output.put_line('DOC_LENGTH = ' || dbms_lob.getlength(l_document));
504 --    dbms_output.put_line(substr(l_document,1,255));
505 
506     return get_segment_kwic(p_text_query, p_starttag, p_endtag, l_document);
507 
508   END get_set_snippet;
509 
510 
511 end CS_KB_KWIC_UTIL  ;