1 PACKAGE BODY IEM_PARSER_PVT as
2 /* $Header: iemparbb.pls 120.3 2005/10/03 15:22:51 appldev noship $*/
3
4 -- TODO:
5 -- add thesaurus processing?
6
7 -- set serverout on size 100000
8
9 -----------------------------------------------------------------
10 -- NOTE: CTXSYS must grant user SELECT on CTX_INDEX_VALUES
11 -- ALSO MUST EXPLICITLY GRANT USER 'CREATE TABLE'
12 -----------------------------------------------------------------
13
14 -- Analyze documents with a view to comparing them
15 -- function COMPUTE_VECTOR calculates the complete set of "phrases"
16 -- for a document, where a phrase is defined a contiguous set of non-stopwords
17 -- of maximum length "window_size" after all punctuation has been removed
18 -- and each word has been "normalized" (reduced to a linguistic stem).
19
20 -- consider the document:
21 -- 'the quick brown fox jumps over the lazy dog'
22 -- where 'the' and 'over' are stopwords.
23 -- if we set our window size to be 2, then the vector will consist of
24 -- "quick" "quick brown" "brown" "brown fox" "fox" "fox jump" "jump"
25 -- "lazy" "lazy dog"
26 -- if window size is set to 3, then we would get
27 -- "quick" "quick brown" "quick brown fox" "brown" "brown fox" "brown fox jump"
28 -- etc.
29
30 -- "word_vectors" are sorted and duplicates removed.
31 -- they may then be compared using the COMPARE_VECTOR function. This counts
32 -- the number of matching phrases in each vector, and divides them by the total
33 -- number of words to give a percentage match. Identical documents will always
34 -- score 100%. As many words will NOT be a match, a score of 10% or greater
35 -- generally represents an excellent match.
36
37 -- the procedure TEST may be used to experiment with scores.
38
39 -- the procedure P_THEMES may be used in place of a call to CTX_DOC.THEMES,
40 -- as it uses the same interface. Note, however, that many more terms will be
41 -- returned in the theme_table than the typical 16 or 32 returned by
42 -- CTX_DOC.THEMES. If the output of this function is used in a CONTAINS query,
43 -- then each term should be prefixed by the $ stemming operator - ie. use
44 -- $(term1), $(term2) instead of ABOUT(term1), ABOUT(term2)
45
46 -- Package variables
47 /*
48 ec_initialized boolean := false;
49 wildCard boolean := false;
50 wildFirst boolean := false;
51
52 type stoplist_type is record (
53 index_name varchar2(30),
54 stopwords word_vector );
55
56 -- stoplist changes each time we process a new index, but is cached between
57 stoplist stoplist_type;
58
59 name_already_used exception;
60 PRAGMA EXCEPTION_INIT(name_already_used, -955);
61
62 imt_error exception;
63 PRAGMA EXCEPTION_INIT(imt_error, -20000);
64
65 -----------------------------------------------------------------
66 -- The following procedure is the exposed API for the
67 -- Email body parser
68 --
69 -- Input paramters are:
70 -- p_message_id number(15) -- Required: Message Id from iem_ms_msgbodys;
71 -- p_search_str varchar2(4000) -- Required: Input email body search string
72 -- p_idx_name varchar2 -- Optional Oracle Text Index name,
73 -- p_analyze_length integer -- default 4000,
74 --
75 -- TO DO
76 -- Maxlen should be max of 4000 ?
77 -----------------------------------------------------------------
78 function start_parser (
79 p_message_id number,
80 p_search_str varchar2,
81 p_idx_name varchar2,
82 p_analyze_length integer
83 ) return word_vector is
84
85 l_window_size integer := 0;
86 l_mail_body varchar2(4000) := null;
87 l_error_message varchar2(200);
88 l_count integer := 0;
89 l_maskLength integer := 0;
90 l_maxlen integer := 0;
91 l_idx_name varchar2(50);
92 l_search_str varchar2(4000);
93 l_vec1 word_vector;
94 l_vec2 word_vector;
95 l_return_vec word_vector;
96 INVALID_MAIL_BODY EXCEPTION;
97 INVALID_MESSAGE_ID EXCEPTION;
98 INVALID_SEARCH_STR EXCEPTION;
99 cursor getmsgbody IS
100 SELECT VALUE from IEM_MS_MSGBODYS where p_message_id = message_id;
101
102 BEGIN
103 l_maskLength := length(p_search_str);
104 l_return_vec := word_vector(); -- initialize
105 --dbms_output.put_line('p_message_id: ' || p_message_id);
106 --dbms_output.put_line('p_search_str: ' || p_search_str);
107 --dbms_output.put_line('p_idx_name: ' || p_idx_name);
108 --dbms_output.put_line('p_analyze_length: ' || p_analyze_length);
109 --dbms_output.put_line('-- Passed in values --------');
110
111 -- Check for null values in passed parameters
112 if p_message_id is null then
113 RAISE INVALID_MESSAGE_ID;
114 end if;
115 if p_search_str is null then
116 RAISE INVALID_SEARCH_STR;
117 end if;
118 l_idx_name := p_idx_name;
119 if l_idx_name is null then
120 l_idx_name := 'emc_idx';
121 end if;
122 if p_analyze_length is null then
123 l_maxlen := 4000;
124 else
125 l_maxlen := p_analyze_length;
126 end if;
127
128 l_maxlen := least(l_maxlen, 32767);
129 l_search_str := p_search_str;
130 if length(l_search_str) > l_maxlen then
131 l_search_str := substr(l_search_str, 1, l_maxlen);
132 end if;
133
134 l_window_size := IEM_PARSER_PVT.get_window_size(l_search_str);
135
136 l_count := instr(l_search_str,'%');
137 if l_count > 0 then
138 if l_count = 1 then
139 wildFirst := true;
140 elsif l_count = l_maskLength then
141 wildCard := true;
142 end if;
143 end if;
144
145 -- Get message body from IEM_MS_MSGBODYS
146 open getmsgbody;
147 fetch getmsgbody INTO l_mail_body;
148 if l_mail_body is null then
149 RAISE INVALID_MAIL_BODY;
150 end if;
151 --dbms_output.put_line('l_mail_body: ' || l_mail_body);
152 --dbms_output.put_line('-- l_mail_body --------');
153
154 --dbms_output.put_line('l_search_str: ' || l_search_str);
155 --dbms_output.put_line('-- Pass l_search_str to compute_vector --------');
156
157 -- Compute Vector for search string
158 l_vec1 := IEM_PARSER_PVT.compute_vector(
159 idx_name => l_idx_name,
160 document => l_search_str,
161 window_size => l_window_size);
162
163 --IEM_PARSER_PVT.dump_word_vector(l_vec1);
164 --dbms_output.put_line('-- dump_word_vector for l_search_str --------');
165
166 -- Compute Vector for message body
167 l_vec2 := IEM_PARSER_PVT.compute_vector(
168 idx_name => l_idx_name,
169 document => l_mail_body,
170 window_size => l_window_size);
171
172 --IEM_PARSER_PVT.dump_word_vector(l_vec2);
173
174 --dbms_output.put_line('--- dump_word_vector for Mail Body --------------------------');
175
176 --l_sim := IEM_PARSER_PVT.compare_vectors(l_vec1, l_vec2);
177
178 -- filp for wild card compare
179 l_return_vec := IEM_PARSER_PVT.compare_vectors(l_vec2, l_vec1);
180
181 --IEM_PARSER_PVT.dump_word_vector(l_return_vec);
182
183 --dbms_output.put_line('------ dump_word_vector for Return_vec in start_parser ---------------');
184
185 if l_return_vec is null then
186 l_return_vec.extend;
187 l_return_vec(1) := 'Error, no return values.';
188 return l_return_vec;
189 else
190 return l_return_vec;
191 end if;
192 EXCEPTION
193 WHEN OTHERS THEN
194 return l_return_vec;
195 END start_parser;
196
197 function get_stoplist (idx_name varchar2)
198 return word_vector is
199 retlist word_vector;
200 cursor c1 is
201 select upper(ixv_value) wrd
202 from ctxsys.ctx_index_values
203 where ( ixv_index_name = upper ( idx_name )
204 or ( ixv_index_owner||'.'||ixv_index_name = upper ( idx_name ))
205 )
206 and ixv_attribute = 'STOP_WORD';
207 begin
208 open c1;
209 fetch c1 bulk collect into retlist;
210 return retlist;
211 end;
212
213 -- init : creates the temporary table for explain if it doesn't exist
214 -- and loads stopword list if not already loaded for this index
215
216 procedure init (idx_name varchar2) is
217 sl word_vector;
218 username varchar2(30);
219 begin
220
221 -- calculate the stoplist for this table unless already cached
222
223 if stoplist.index_name is null or stoplist.index_name <> idx_name then
224 stoplist.index_name := idx_name;
225 stoplist.stopwords := get_stoplist(idx_name);
226 end if;
227
228 if ec_initialized <> true then
229
230 begin
231 execute immediate (
232 'create global temporary table ec_ana_explain ' ||
233 ' ( ' ||
234 ' explain_id varchar2(30), ' ||
235 ' id number, ' ||
236 ' parent_id number, ' ||
237 ' operation varchar2(30), ' ||
238 ' options varchar2(30), ' ||
239 ' object_name varchar2(64), ' ||
240 ' position number, ' ||
241 ' cardinality number ' ||
242 ' )');
243 exception
244 when name_already_used then null;
245 end;
246
247 begin
248 execute immediate (
249 'create table ec_ana_roots ' ||
250 ' ( ' ||
251 ' token varchar2(64), ' ||
252 ' root varchar2(64) ' ||
253 ' )');
254 execute immediate (
255 'create index ec_ana_roots_index '||
256 ' on ec_ana_roots(token)');
257 exception
258 when name_already_used then null;
259 end;
260
261 ec_initialized := true;
262
263 end if;
264
265 end;
266
267 function is_a_stopword (wrd varchar2)
268 return boolean is
269 l_count integer :=0;
270 begin
271 for i in 1 .. stoplist.stopwords.count loop
272 if wrd = stoplist.stopwords(i) then
273 return true;
274 end if;
275 end loop;
276 --dbms_output.put_line (' In is_a_stopword, wrd = '|| wrd);
277 return false;
278 end;
279
280 procedure dump_word_vector (inlist in word_vector) is
281 i integer;
282 begin
283 for i in 1 .. inlist.count loop
284 dbms_output.put_line('token: '|| inlist(i));
285 end loop;
286 end;
287
288 procedure move_item (inlist in out NOCOPY word_vector, frm integer, too integer) is
289 tmp varchar2(2000);
290 i integer;
291 begin
292 tmp := inlist(frm);
293 for i in reverse too+1 .. frm loop
294 inlist(i) := inlist(i-1);
295 end loop;
296 inlist(too) := tmp;
297 end;
298
299 -- insertion sort of word_vector
300
301 function sort_list (inlist in word_vector, dedupe boolean)
302 return word_vector is
303 retlist word_vector;
304 m integer;
305 n integer;
306 maxi integer; -- max offset in array
307 begin
308 retlist := inlist;
309
310 if retlist.count <= 1 then
311 return retlist;
312 end if;
313
314 for i in 2 .. retlist.count loop
315 for k in 1 .. i loop
316 if retlist(i) < retlist(k) then
317 move_item(retlist, i, k);
318 exit;
319 end if;
320 end loop;
321 end loop;
322
323 if dedupe then
324 m := 1;
325 maxi := retlist.count;
326 while m < maxi loop
327 if retlist(m) = retlist(m+1) then
328 for n in m .. maxi-1 loop
329 retlist(n) := retlist(n+1);
330 end loop;
331 retlist.delete(maxi);
332 maxi := maxi - 1;
333 else
334 m := m + 1;
335 end if;
336 end loop;
337 end if;
338
339 return retlist;
340 end;
341
342 -- call with indexname and word to get the root of
343
344 function get_root_no_cache (idx_name varchar2, term varchar2)
345 return varchar2 is
346 exid number;
347 retval varchar2(2000);
348 begin
349
350 -- dbms_output.put_line('root for '||term);
351 begin
352 ctx_query.explain (
353 index_name => idx_name,
354 text_query => '$'||term,
355 explain_table => 'ec_ana_explain',
356 sharelevel => 0, -- do share
357 explain_id => 1 );
358 exception
359 when imt_error then
360 return term;
361 end;
362 begin
363 execute immediate (
364 ' select object_name from ec_ana_explain ' ||
365 ' where explain_id = 1 ' ||
366 ' and position = 1 ' ||
367 ' and parent_id = 1' ) into retval;
368 exception when no_data_found then
369 retval := term;
370 end;
371
372 execute immediate ('truncate table ec_ana_explain');
373
374 -- dbms_output.put_line('is '||retval);
375 return retval;
376
377 end;
378
379
380 function get_root (idx_name varchar2, term varchar2)
381 return varchar2 is
382 exid number;
383 retval varchar2(2000);
384
385 begin
386
387 begin
388 execute immediate (
389 'select root from ec_ana_roots where token = :t1')
390 into retval using term;
391
392 exception when no_data_found then
393 begin
394 ctx_query.explain (
395 index_name => idx_name,
396 text_query => '$'||term,
397 explain_table => 'ec_ana_explain',
398 sharelevel => 0, -- do share
399 explain_id => 1 );
400 exception
401 when imt_error then
402 execute immediate('insert into ec_ana_roots (token, root) values (:t1, :t2)')
403 using term, retval;
404 return term;
405 end;
406
407 begin
408 execute immediate (
409 ' select object_name from ec_ana_explain ' ||
410 ' where explain_id = 1 ' ||
411 ' and position = 1 ' ||
412 ' and parent_id = 1' ) into retval;
413 exception when no_data_found then
414 retval := term;
418 execute immediate('insert into ec_ana_roots (token, root) values (:t1, :t2)')
415 end;
416 execute immediate ('truncate table ec_ana_explain');
417
419 using term, retval;
420
421 return retval;
422
423 end;
424
425 return retval;
426
427 end;
428
429 function normalize_list (idx_name varchar2, inlist in word_vector)
430 return word_vector is
431 retlist word_vector;
432 begin
433 retlist := word_vector();
434 for i in 1 .. inlist.count loop
435 retlist.extend;
436 if is_a_stopword(inlist(i)) then
437 retlist(i) := inlist(i);
438 else
439 retlist(i) := get_root(idx_name, inlist(i));
440 end if;
441 end loop;
442 return retlist;
443 end;
444
445 function remove_stopwords (inlist in word_vector)
446 return word_vector is
447 cntr integer := 1;
448 retlist word_vector;
449 begin
450 retlist := word_vector();
451 for i in 1 .. inlist.count loop
452 if (not (is_a_stopword(inlist(i))) ) then
453 retlist.extend;
454 retlist(cntr) := inlist(i);
455 cntr := cntr + 1;
456 end if;
457 end loop;
458 return retlist;
459 end;
460
461 -- get phrases
462
463 -- from a list of words, produce a list of phrases, defined as any
464 -- contiguous list of non-stopwords, up to max_words in length.
465
466 function get_phrases (inlist word_vector, max_words integer)
467 return word_vector is
468 p integer;
469 phrase varchar2(2000);
470 phrase1 varchar2(2000);
471 space varchar2(1);
472 cntr integer := 1;
473 inlistCount integer := 0;
474 mw integer := 0;
475 retlist word_vector;
476 newInlist word_vector;
477
478 begin
479 inlistCount:= inlist.count;
480 mw := max_words;
481 retlist := word_vector();
482 -- newInlist := word_vector();
483 newInlist := inlist;
484
485 newInlist := remove_stopwords(newInlist);
486
487 for i in 1 .. newInlist.count loop
488 --if not is_a_stopword(inlist(i)) then
489 space := '';
490 phrase := '';
491 p := 0;
492 while (p <= mw-1 and i+p <= newInlist.count) loop
493 --if (wildCard) OR (wildFirst) then -- Check for and remove all wild card character from the phrase
494 -- dbms_output.put_line('in get_phrases wild = true');
495 -- phrase1 := IEM_PARSER_PVT.remove_wild(newInlist(i+p));
496 -- phrase := phrase || space || phrase1 ;
497 --else
498 phrase := phrase || space || newInlist(i+p);
499 --end if;
500 if p = mw-1 then
501 retlist.extend;
502 retlist(cntr) := phrase;
503 --dbms_output.put_line('phrase: '|| phrase);
504 --dbms_output.put_line('');
505 cntr := cntr + 1;
506 end if;
507 space := ' ';
508 p := p+1;
509 end loop;
510 --end if;
511 end loop;
512 return retlist;
513 end;
514
515 function parse_string (stringToParse in varchar2)
516 return word_vector is
517 ws varchar2(32767); -- workspace
518 theword varchar2(2000);
519 p integer;
520 cntr integer := 1;
521 rettab word_vector;
522
523 c integer := 0;
524
525 begin
526
527 rettab := word_vector(); -- initialize
528
529 ws := stringToParse;
530
531 -- translate all punctuation into spaces
532
533 -- ws := translate (ws, '`!"$%^&*()_+{}:@~|<>?-=[];''#\,./',
534 -- ' ');
535 -- Don't remove %,@,.,#,?,'and comma chars, use to denote wild card or email address.
536 -- If changed, be sure to add to the is_num function
537
538 -- Commited out for R12 patch 4417790 issue
539
540 --ws := translate (ws, '`!"$^&*()_+{}~|<>-=;\,/',
541 ' ');
542
543 -- and whitespace control chars
544 ws := translate (ws, fnd_global.local_chr(10)||fnd_global.local_chr(11)||fnd_global.local_chr(12)||fnd_global.local_chr(13), ' ');
545
546 -- upper case it all
547 ws := translate (ws, 'abcdefghijklmnopqrstuvwxyz',
548 'ABCDEFGHIJKLMNOPQRSTUVWXYZ');
549
550 -- remove multiple spaces
551 while (instr(ws, ' ', 1) > 0) loop
552 ws := replace (ws, ' ', ' ');
553 end loop;
554
555 -- and any leading or trailing space
556 if (substr(ws, 1, 1) = ' ') then
557 ws := substr(ws, 2, length(ws)-1);
558 end if;
559
560 if (substr(ws, length(ws), 1) = ' ') then
561 ws := substr(ws, 1, length(ws)-1);
562 end if;
563
564 theword := substr(ws, 280, 40);
565
566 p := instr(ws, ' ');
567
568 while p > 0 loop
569
570 c := c+1;
571
572 theword := substr(ws, 1, p-1);
573
574 -- save word. Simply discard it if too long
575 if length(theword) <= MAXWORDLENGTH then
576 rettab.extend;
577 rettab(cntr) := theword;
578 cntr := cntr + 1;
579 end if;
580
581 ws := substr(ws, p+1, length(ws)-p);
582 p := instr(ws, ' ');
583
584 end loop;
585
586 theword := ws;
587 if length(theword) > 0 then
588 rettab.extend;
589 rettab(cntr) := theword;
590 end if;
591
592 return rettab;
593 end;
594
595 -- compute the word vector
599 document in varchar2,
596
597 function compute_vector (
598 idx_name in varchar2, --DEFAULT fnd_api.g_miss_char,
600 analyze_length in integer default 5000,
601 window_size in integer default 3
602 )
603 return word_vector is
604 maxlen integer;
605 ws integer;
606 thedoc varchar2(32767);
607 retlist word_vector;
608 t1 number; t2 number; t3 number; t4 number; t5 number;
609 begin
610
611 if analyze_length is null then
612 maxlen := 5000;
613 else
614 maxlen := analyze_length;
615 end if;
616
617 maxlen := least(maxlen, 32767);
618
619 if window_size is null then
620 ws := 3;
621 else
622 ws := window_size;
623 end if;
624
625 if length(document) > maxlen then
626 thedoc := substr(document, 1, maxlen);
627 else
628 thedoc := document;
629 end if;
630
631 init(idx_name);
632
633 --t1 := dbms_utility.get_time;
634
635 retlist := parse_string(thedoc);
636 --dbms_output.put_line ('This is retlist after parse_string :'|| retlist(1));
637
638 --t2 := dbms_utility.get_time;
639 --dbms_output.put_line('parse ' || to_char((t2-t1)/100));
640
641 --retlist := normalize_list ( idx_name, retlist );
642
643 --t3 := dbms_utility.get_time;
644 -- dbms_output.put_line('Normalise ' || to_char((t3-t2)/100));
645
646 --dbms_output.put_line ('Just before get_phrases');
647 --dbms_output.put_line ('This is retlist.count :'|| retlist.count);
648
649 retlist := get_phrases ( retlist, ws );
650
651 --dbms_output.put_line ('Just after get_phrases');
652 --dbms_output.put_line ('This is retlist.count :'|| retlist.count);
653
654 --t4 := dbms_utility.get_time;
655 --dbms_output.put_line('phrase ' || to_char((t4-t3)/100));
656
657 -- retlist := sort_list ( retlist, true );
658 retlist := sort_list ( retlist, false ); -- Don't remove duplicates
659
660 --dbms_output.put_line ('This is retlist after sort :'|| retlist);
661 --for i in 1 .. retlist.count loop
662 --dbms_output.put_line ('This is retlist after sort :'|| retlist(i));
663 --end loop;
664
665 --t5 := dbms_utility.get_time;
666 --dbms_output.put_line('Normalise ' || to_char((t3-t2)/100)||' sort ' || to_char((t5-t4)/100));
667
668 return retlist;
669 end;
670
671 -- same function for clobs
672
673 function compute_vector (
674 idx_name in varchar2,
675 document in clob,
676 analyze_length in integer default 5000,
677 window_size in integer default 3
678 )
679 return word_vector is
680 retlist word_vector;
681 maxlen integer;
682 thestrng varchar2(32767);
683 theoffset integer;
684 thesize integer;
685 thebuff varchar2(32767);
686 begin
687
688 if analyze_length is null then
689 maxlen := 5000;
690 else
691 maxlen := analyze_length;
692 end if;
693
694 maxlen := least(maxlen, 32767);
695
696 theoffset := 1;
697 thestrng := ' ';
698 while (true) loop
699
700 begin
701 thesize := maxlen;
702 dbms_lob.read(
703 lob_loc => document,
704 amount => thesize,
705 offset => theoffset,
706 buffer => thebuff );
707 exception
708 when no_data_found then
709 exit;
710 end;
711
712 exit when (thesize <= 0);
713 theoffset := thesize+1;
714
715 if length(thestrng) + thesize < maxlen then
716 thestrng := thestrng || thebuff;
717 else
718 thestrng := thestrng || substr(thebuff, 1, maxlen-length(thestrng)-1);
719 exit;
720 end if;
721
722 end loop;
723
724 if length(thestrng) + thesize < maxlen then
725 thestrng := thestrng || thebuff;
726 else
727 thestrng := thestrng || substr(thebuff, 1, maxlen-length(thestrng)-1);
728 end if;
729
730 retlist := compute_vector (idx_name, thestrng, maxlen, window_size);
731
732 return retlist;
733 end;
734
735 -- compare two lists of phrases
736 -- resulting similarity quotient is based on number of shared phrases,
737 -- reduced according to length of each vector.
738
739 function compare_vectors (inlist1 in word_vector, inlist2 in word_vector)
740 return word_vector is
741 result_array word_vector;
742 matchcnt integer := 0;
743 j integer := 1;
744 k integer := 1;
745 l_Error_Message varchar2(200);
746 INVALID_WORD_VECTOR EXCEPTION;
747 begin
748
749 result_array := word_vector(); -- initialize
750
751
752 if inlist1 is null or inlist2 is null then
753 RAISE INVALID_WORD_VECTOR;
754 end if;
755
756 if inlist1.count = 0 or inlist2.count = 0 then
757 RAISE INVALID_WORD_VECTOR;
758 end if;
759
760
761 while (j <= inlist1.count and k <= inlist2.count) loop
762 while (k <= inlist2.count and j <= inlist1.count) loop
763
764 -- Don't count the wild # as a direct match in string when it is not quoted
765 if (inlist1(j) = inlist2(k)) and not (instr(inlist2(k), '#') > 0) then
766 --dbms_output.put_line('***** match on in compare vector ***** '|| inlist1(j));
767 matchcnt := matchcnt + 1;
768 --dbms_output.put_line('***** matchcnt ***** '|| matchcnt);
769 result_array.extend;
770 result_array(matchcnt) := inlist1(j);
774 elsif (instr(inlist2(k), '%') > 0) then
771 --dbms_output.put_line('***** result_array(matchcnt) ***** '|| result_array(matchcnt));
772 j := j+1;
773 exit;
775 --dbms_output.put_line('---------------------------');
776 --dbms_output.put_line('---------------------------');
777 --dbms_output.put_line('At wild % in compare_vec');
778 --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
779 --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
780 if (test_match(inlist2(k), inlist1(j))) then
781 --dbms_output.put_line('match on % in compare_vectors '|| inlist1(j));
782 matchcnt := matchcnt + 1; -- update match count and move on
783 result_array.extend;
784 result_array(matchcnt) := inlist1(j);
785 j := j+1;
786 exit;
787 else -- just move to the next word
788 j := j+1;
789 exit;
790 end if;
791 elsif (instr(inlist2(k), '#') > 0) then
792 --dbms_output.put_line('---------------------------');
793 --dbms_output.put_line('---------------------------');
794 --dbms_output.put_line('At wild # in compare_vec');
795 --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
796 --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
797 if (test_match_single(inlist2(k), inlist1(j))) then
798 --dbms_output.put_line('match on is_num in compare_vectors '|| inlist1(j));
799 matchcnt := matchcnt + 1; -- update match count and move on
800 result_array.extend;
801 result_array(matchcnt) := inlist1(j);
802 j := j+1;
803 exit;
804 else -- just move to the next word
805 j := j+1;
806 exit;
807 end if;
808 elsif (instr(inlist2(k), '?') > 0) then
809 --dbms_output.put_line('---------------------------');
810 --dbms_output.put_line('---------------------------');
811 --dbms_output.put_line('At wild ? in compare_vec');
812 --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
813 --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
814 if (test_match_single(inlist2(k), inlist1(j))) then
815 --dbms_output.put_line('match on NOT is_num in compare_vectors '|| inlist1(j));
816 matchcnt := matchcnt + 1; -- update match count and move on
817 result_array.extend;
818 result_array(matchcnt) := inlist1(j);
819 j := j+1;
820 exit;
821 else -- just move to the next word
822 j := j+1;
823 exit;
824 end if;
825 elsif (inlist2(k) > inlist1(j)) or (wildCard) then
826 j := j+1;
827 --dbms_output.put_line('j+1 is at k > j: '|| j);
828 --dbms_output.put_line('This is inlist1(j) at k > j: ' || inlist1(j));
829 --dbms_output.put_line('This is inlist2(k) at k > j: ' || inlist2(k));
830 exit;
831 end if;
832 k := k+1;
833 end loop;
834 end loop;
835 --dbms_output.put_line('Matches : '||to_char(matchcnt));
836 --dbms_output.put_line('Word Count: '||to_char(inlist1.count+inlist2.count));
837 --dbms_output.put_line('Mail body word count: '||to_char(inlist1.count));
838 --dbms_output.put_line('sqrt : '||to_char(sqrt(sqrt(inlist1.count+inlist2.count))));
839 --dbms_output.put_line(inlist1.count+inlist2.count);
840 wildCard := false; -- reset global variable
841 --dbms_output.put_line (' --------- wildCard set to false ----------- ');
842 --return (matchcnt*1.0)/sqrt(sqrt((inlist1.count+inlist2.count)))*100;
843 --return ((matchcnt*1.0)/(inlist1.count))*100;
844 --return ((matchcnt*1.0)/(inlist2.count))*100;
845 return result_array;
846 EXCEPTION
847 WHEN INVALID_WORD_VECTOR THEN
848 FND_MESSAGE.SET_NAME('IEM','IEM_INVALID_WORD_VECTOR');
849 l_Error_Message := FND_MESSAGE.GET;
850 fnd_file.put_line(fnd_file.log, l_Error_Message);
851
852 end;
853
854 procedure p_themes (
855 index_name in varchar2,
856 -- textkey in varchar2,
857 -- restab in out ctx_doc.theme_tab,
858 restab in out NOCOPY iem_im_wrappers_pvt.theme_table,
859 full_themes in boolean default true,
860 num_themes in number default 32,
861 document in varchar2,
862 pictograph in boolean default false
863 )
864 is
865 -- thedoc clob;
866 wv word_vector;
867 begin
868
869
870 -- ctx_doc.filter(
871 -- index_name => index_name,
872 -- textkey => textkey,
873 -- restab => thedoc,
874 -- plaintext => true );
875
876 wv := IEM_PARSER_PVT.compute_vector(
877 -- idx_name => 'acr_resp_index',
878 idx_name => index_name,
879 document => document,
880 analyze_length => DOCLENGTH,
881 window_size => WINDOWSIZE );
882
883 for i in 1 .. wv.count loop
884 IF wv(i) <>' ' then
885 restab(i).theme := wv(i);
886 restab(i).weight := 1;
887 END IF;
888 end loop;
889
890 end;
891
892 function get_window_size (search_string varchar2)
893 return integer is
894
895 INVALID_LOB EXCEPTION;
896 ws varchar2(32767); -- workspace
897 theword varchar2(2000);
898 p integer;
899 cntr integer := 1;
900 rettab word_vector;
901 window_size integer := 0;
902
903 BEGIN
904
905 -- Find the window size/number of words in the search phrase
906
910 ws := replace (ws, ' ', ' ');
907 ws := search_string;
908 -- remove multiple spaces
909 while (instr(ws, ' ', 1) > 0) loop
911 end loop;
912
913 -- and any leading or trailing space
914 if (substr(ws, 1, 1) = ' ') then
915 ws := substr(ws, 2, length(ws)-1);
916 end if;
917
918 if (substr(ws, length(ws), 1) = ' ') then
919 ws := substr(ws, 1, length(ws)-1);
920 end if;
921
922 theword := substr(ws, 280, 40);
923
924 p := instr(ws, ' ');
925
926 while p > 0 loop
927 window_size := window_size+1;
928
929 ws := substr(ws, p+1, length(ws)-p);
930
931 p := instr(ws, ' ');
932
933 end loop;
934 window_size := window_size+1; -- Final window/word count
935 return window_size;
936 END; -- get_window_size
937
938 function remove_wild (mask varchar2)
939 return varchar2 is
940
941 newMask varchar2(2000);
942 l_count integer := 0;
943 maskLength integer := 0;
944
945 BEGIN
946 l_count := instr(mask, '%');
947 maskLength := length(mask);
948 if l_count > 1 then -- % is at the end of the word
949 newMask := substr(mask, 1, maskLength -1); -- Remove % form the wnd of the mask
950 return newMask;
951 elsif l_count = 1 then -- % is at the begining of the word
952 newMask := substr(mask, 2, maskLength -1);
953 return newMask;
954 end if;
955 return mask;
956 END;
957
958 function test_match (mask varchar2, testToken varchar2)
959 return boolean is
960
961 maskLength integer := 0;
962 testTokenLength integer := 0;
963
964 matchcnt integer := 0;
965 m integer := 1;
966 n integer := 1;
967 literalChar boolean := false;
968
969 BEGIN
970
971 maskLength := length(mask);
972 testTokenLength := length(testToken);
973
974 if mask is null or testToken is null then
975 return false;
976 end if;
977
978 if maskLength = 0 or testTokenLength = 0 then
979 return false;
980 end if;
981
982 while (m <= maskLength and n <= testTokenLength) loop
983 --dbms_output.put_line('-------------------------------------');
984 --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
985 --dbms_output.put_line('substr(testToken,n,1) '|| substr(testToken,n,1));
986
987 if (substr(mask,m,1) = fnd_global.local_chr(39)) then
988 --dbms_output.put_line('At chr(39)');
989 m := m+1; -- move to the next character in the mask
990 --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
991 --dbms_output.put_line('=-=-=-=-=-=-=-=-=-=-=-=-=-=');
992 matchcnt := matchcnt + 1; -- count this as a match.
993 if (literalChar = false) then
994 literalChar := true; --flip the literal character flag
995 else
996 literalChar := false;
997 end if;
998 elsif (substr(mask,m,1) = '#' AND NOT literalChar) then -- We need to see if the current testToken char is anumber
999 if (is_num(substr(testToken,n,1))) then
1000 m := m+1; -- move to the next character in the mask
1001 matchcnt := matchcnt + 1; -- this is a match.
1002 n := n+1; -- move to the next testToken char
1003 --dbms_output.put_line('(substr(mask,m,1) TRUE '|| substr(mask,m,1));
1004 else
1005 --dbms_output.put_line('substr(mask,m,1) FALSE '|| substr(mask,m,1));
1006 return false; -- This is not a match for this token
1007 end if;
1008 elsif (substr(mask,m,1) = '?' AND NOT literalChar) then -- skip this in the mask and count it as a
1009 if (NOT(is_num(substr(testToken,n,1)))) then
1010 m := m+1; -- move to the next character in the mask
1011 matchcnt := matchcnt + 1; -- this is a match
1012 n := n+1; -- move to the next testToken char
1013 else
1014 return false; -- This is not a match for this token
1015 end if;
1016 elsif (substr(mask,m,1) = '%' AND NOT literalChar) then -- test if testToken has a match for then next mask character
1017 -- anywhere in it's string.
1018 m := m+1; -- move to the next character in the mask
1019 matchcnt := matchcnt + 1; -- have the wild card march.
1020 n := n+1; -- The % will match any single character in the testToken string
1021
1022 while (m <= maskLength and n <= testTokenLength) loop
1023 if substr(mask,m,1) = substr(testToken,n,1) then
1024 --dbms_output.put_line('match on new and improved Wild '|| substr(testToken,n,1));
1025 matchcnt := matchcnt + 1;
1026 m := m+1;
1027 n := n+1;
1028 else
1029 -- Need to take into account the single quote
1030 if (substr(mask,m,1) = '%' OR substr(mask,m,1) = '#' OR substr(mask,m,1) = '?' OR substr(mask,m,1) = fnd_global.local_chr(39)) then
1031 exit; -- leave inner loop and continue processing strings
1032 else
1033 n :=n+1; -- chars don't match so move n along
1034 end if;
1035 end if;
1036 end loop;
1037 elsif substr(mask,m,1) = substr(testToken,n,1) then
1038 --dbms_output.put_line('match on from new test_match: '|| substr(testToken,n,1));
1039 matchcnt := matchcnt + 1; -- move on in mask and testToken
1040 m := m+1;
1041 n := n+1;
1042 else -- not a match so return false
1043 --dbms_output.put_line('At last else -- returnig false at New test_match');
1044 return false;
1045 end if;
1049
1046 end loop;
1047
1048 --dbms_output.put_line('matchLength and matchcnt '|| maskLength ||' '|| matchcnt);
1050 if maskLength = matchcnt then
1051 return true;
1052 else
1053 return false;
1054 end if;
1055 END; -- test_match
1056
1057 function test_match_single (mask varchar2, testToken varchar2)
1058 return boolean is
1059
1060 maskLength integer := 0;
1061 testTokenLength integer := 0;
1062
1063 matchcnt integer := 0;
1064 m integer := 1;
1065 n integer := 1;
1066 literalChar boolean := false;
1067
1068 BEGIN
1069
1070 maskLength := length(mask);
1071 testTokenLength := length(testToken);
1072
1073 if mask is null or testToken is null then
1074 return false;
1075 end if;
1076
1077 if maskLength = 0 or testTokenLength = 0 then
1078 return false;
1079 end if;
1080
1081 while (m <= maskLength and n <= testTokenLength) loop
1082 --dbms_output.put_line('-------------------------------------');
1083 --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
1084 --dbms_output.put_line('substr(testToken,n,1) '|| substr(testToken,n,1));
1085
1086 if (substr(mask,m,1) = fnd_global.local_chr(39)) then
1087 --dbms_output.put_line('At chr(39)');
1088 m := m+1; -- move to the next character in the mask
1089 --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
1090 --dbms_output.put_line('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<');
1091 if (literalChar = false) then
1092 literalChar := true; --flip the literal character flag
1093 else
1094 literalChar := false;
1095 end if;
1096
1097 elsif( substr(mask,m,1) = '#' AND NOT literalChar) then -- test if testToken has an number in it's corresponding position
1098 if (is_num(substr(testToken,n,1))) then
1099 --dbms_output.put_line('match on from is_num: TRUE '|| substr(testToken,n,1));
1100 matchcnt := matchcnt + 1;
1101 m := m+1;
1102 n := n+1;
1103 else -- testToken position does not contain an number
1104 --dbms_output.put_line('match on from is_num: FALSE '|| substr(testToken,n,1));
1105 return false;
1106 end if;
1107 elsif (substr(mask,m,1) = '?' AND NOT literalChar)then -- test if testToken has an number in it's corresponding position
1108 if (not(is_num(substr(testToken,n,1)))) then
1109 --dbms_output.put_line('match on NOT num from is_num: '|| substr(testToken,n,1));
1110 matchcnt := matchcnt + 1;
1111 m := m+1;
1112 n := n+1;
1113 else -- testToken position does not contain an number
1114 return false;
1115 end if;
1116 elsif substr(mask,m,1) = substr(testToken,n,1) then
1117 --dbms_output.put_line('match on from test_match_single: '|| substr(testToken,n,1));
1118 matchcnt := matchcnt + 1; -- move on in mask and testToken
1119 m := m+1;
1120 n := n+1;
1121 else -- not a match so return false
1122 --dbms_output.put_line('At last else -- returnig false at test_mask_single');
1123 return false;
1124 end if;
1125 end loop;
1126
1127 --dbms_output.put_line('matchLength and matchcnt '|| maskLength ||' '|| matchcnt);
1128
1129 if testTokenLength = matchcnt then
1130 return true;
1131 else
1132 return false;
1133 end if;
1134 END; -- test_match_single
1135
1136 Function is_num (s1 varchar2)
1137 RETURN boolean IS
1138 BEGIN
1139 IF
1140 INSTR(TRANSLATE(upper(s1),'ABCDEFGHIJKLMNOPQRSTUVWXYZ%@.#?,','$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'),'$') = 0 THEN
1141 RETURN true; -- No characters where translated so s1 is a number
1142 ELSE
1143 RETURN false; -- Characters were translated to $ so s1 was not a number`
1144 END IF;
1145 END; -- is_num
1146
1147 procedure test is
1148 vec1 word_vector;
1149 vec2 word_vector;
1150 result_vec word_vector;
1151 doc varchar2(4000);
1152 begin
1153
1154 doc := 'the QUICK brown Fox because JUMPS o''er the LAZy DOG. The Dogs are LAUGHING to see such foxes AND the
1155 fox-cow JUMPED over the moon because he could';
1156 -- fails
1157 -- doc := 'the QUICK brown Fox because JUMPS over the LAZy DOG The Dogs';
1158 -- ok
1159 -- doc := 'aa BB cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx yy zz';
1160 --??
1161 doc := 'the QUICK brown Fox because JUMPS
1162 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa over the LAZy DOG the p qq rr
1163 ss tt uu vv ww xx yy zz';
1164
1165 vec1 := IEM_PARSER_PVT.compute_vector(
1166 idx_name => 'explain_ex_text',
1167 document => doc,
1168 window_size => 3);
1169
1170 --dbms_output.put_line('----------');
1171
1172 vec2 := IEM_PARSER_PVT.compute_vector(
1173 idx_name => 'explain_ex_text',
1174 document => 'how now red fox. The quick quacking quick brown fox is a lazy dog',
1175 window_size => 3);
1176
1177 result_vec := IEM_PARSER_PVT.compare_vectors(vec1, vec2);
1178
1179 --dbms_output.put_line('Similarity %age is : ' || to_char(round((result_vec.count), 2)));
1180
1181 end;
1182
1183 */
1184 END IEM_PARSER_PVT;