DBA Data[Home] [Help]

PACKAGE BODY: APPS.IEM_PARSER_PVT

Source


1 PACKAGE BODY IEM_PARSER_PVT as
2 /* $Header: iemparbb.pls 120.3 2005/10/03 15:22:51 appldev noship $*/
3 
4 -- TODO:
5 -- add thesaurus processing?
6 
7 -- set serverout on size 100000
8 
9 -----------------------------------------------------------------
10 -- NOTE: CTXSYS must grant user SELECT on CTX_INDEX_VALUES
11 --       ALSO MUST EXPLICITLY GRANT USER 'CREATE TABLE'
12 -----------------------------------------------------------------
13 
14 -- Analyze documents with a view to comparing them
15 -- function COMPUTE_VECTOR calculates the complete set of "phrases"
16 -- for a document, where a phrase is defined a contiguous set of non-stopwords
17 -- of maximum length "window_size" after all punctuation has been removed
18 -- and each word has been "normalized" (reduced to a linguistic stem).
19 
20 -- consider the document:
21 --  'the quick brown fox jumps over the lazy dog'
22 -- where 'the' and 'over' are stopwords.
23 -- if we set our window size to be 2, then the vector will consist of
24 --  "quick" "quick brown" "brown" "brown fox" "fox" "fox jump" "jump"
25 --  "lazy" "lazy dog"
26 -- if window size is set to 3, then we would get
27 --  "quick" "quick brown" "quick brown fox" "brown" "brown fox" "brown fox jump"
28 -- etc.
29 
30 -- "word_vectors" are sorted and duplicates removed.
31 -- they may then be compared using the COMPARE_VECTOR function. This counts
32 -- the number of matching phrases in each vector, and divides them by the total
33 -- number of words to give a percentage match. Identical documents will always
34 -- score 100%.  As many words will NOT be a match, a score of 10% or greater
35 -- generally represents an excellent match.
36 
37 -- the procedure TEST may be used to experiment with scores.
38 
39 -- the procedure P_THEMES may be used in place of a call to CTX_DOC.THEMES,
40 -- as it uses the same interface. Note, however, that many more terms will be
41 -- returned in the theme_table than the typical 16 or 32 returned by
42 -- CTX_DOC.THEMES.  If the output of this function is used in a CONTAINS query,
43 -- then each term should be prefixed by the $ stemming operator - ie. use
44 --   $(term1), $(term2) instead of ABOUT(term1), ABOUT(term2)
45 
46 -- Package variables
47 /*
48 ec_initialized boolean := false;
49 wildCard boolean := false;
50 wildFirst boolean := false;
51 
52 type stoplist_type is record (
53   index_name  varchar2(30),
54   stopwords   word_vector );
55 
56 -- stoplist changes each time we process a new index, but is cached between
57 stoplist stoplist_type;
58 
59 name_already_used exception;
60 PRAGMA EXCEPTION_INIT(name_already_used, -955);
61 
62 imt_error exception;
63 PRAGMA EXCEPTION_INIT(imt_error, -20000);
64 
65 -----------------------------------------------------------------
66 -- The following procedure is the exposed API for the
67 -- Email body parser
68 --
69 -- Input paramters are:
70 -- 	p_message_id      number(15)		-- Required: Message Id from iem_ms_msgbodys;
71 --	p_search_str      varchar2(4000)	-- Required: Input email body search string
72 --	p_idx_name        varchar2		-- Optional Oracle Text Index name,
73 --	p_analyze_length  integer		-- default 4000,
74 --
75 -- TO DO
76 -- Maxlen should be max of 4000 ?
77 -----------------------------------------------------------------
78 function start_parser (
79     p_message_id	number,
80     p_search_str 	varchar2,
81     p_idx_name         	varchar2,
82     p_analyze_length   	integer
83     ) return word_vector is
84 
85     l_window_size 	integer := 0;
86     l_mail_body       	varchar2(4000) := null;
87     l_error_message 	varchar2(200);
88     l_count 		integer := 0;
89     l_maskLength 	integer := 0;
90     l_maxlen   		integer := 0;
91     l_idx_name          varchar2(50);
92     l_search_str        varchar2(4000);
93     l_vec1  		word_vector;
94     l_vec2  		word_vector;
95     l_return_vec	word_vector;
96     INVALID_MAIL_BODY  	EXCEPTION;
97     INVALID_MESSAGE_ID 	EXCEPTION;
98     INVALID_SEARCH_STR 	EXCEPTION;
99     cursor getmsgbody IS
100         SELECT VALUE from IEM_MS_MSGBODYS where p_message_id = message_id;
101 
102 BEGIN
103     l_maskLength := length(p_search_str);
104     l_return_vec := word_vector();  -- initialize
105     --dbms_output.put_line('p_message_id: ' || p_message_id);
106     --dbms_output.put_line('p_search_str: ' || p_search_str);
107     --dbms_output.put_line('p_idx_name: ' ||  p_idx_name);
108     --dbms_output.put_line('p_analyze_length: ' ||  p_analyze_length);
109     --dbms_output.put_line('-- Passed in values --------');
110 
111     -- Check for null values in passed parameters
112     if p_message_id is null then
113         RAISE INVALID_MESSAGE_ID;
114     end if;
115     if p_search_str is null then
116         RAISE INVALID_SEARCH_STR;
117     end if;
118     l_idx_name := p_idx_name;
119     if l_idx_name is null then
120        l_idx_name := 'emc_idx';
121     end if;
122     if p_analyze_length is null then
123         l_maxlen := 4000;
124     else
125         l_maxlen := p_analyze_length;
126     end if;
127 
128     l_maxlen := least(l_maxlen, 32767);
129     l_search_str := p_search_str;
130     if length(l_search_str) > l_maxlen then
131         l_search_str := substr(l_search_str, 1, l_maxlen);
132     end if;
133 
134     l_window_size := IEM_PARSER_PVT.get_window_size(l_search_str);
135 
136     l_count := instr(l_search_str,'%');
137     if l_count > 0 then
138         if  l_count = 1 then
139             wildFirst := true;
140         elsif l_count = l_maskLength then
141             wildCard := true;
142         end if;
143     end if;
144 
145      -- Get message body from IEM_MS_MSGBODYS
146         open getmsgbody;
147 	fetch getmsgbody INTO l_mail_body;
148             if l_mail_body is null then
149         	RAISE INVALID_MAIL_BODY;
150             end if;
151     --dbms_output.put_line('l_mail_body: ' ||  l_mail_body);
152     --dbms_output.put_line('-- l_mail_body --------');
153 
154     --dbms_output.put_line('l_search_str: ' ||  l_search_str);
155     --dbms_output.put_line('-- Pass l_search_str to compute_vector --------');
156 
157     -- Compute Vector for search string
158     l_vec1 := IEM_PARSER_PVT.compute_vector(
159         idx_name => l_idx_name,
160         document => l_search_str,
161         window_size => l_window_size);
162 
163     --IEM_PARSER_PVT.dump_word_vector(l_vec1);
164     --dbms_output.put_line('--  dump_word_vector for l_search_str --------');
165 
166     -- Compute Vector for message body
167     l_vec2 := IEM_PARSER_PVT.compute_vector(
168         idx_name => l_idx_name,
169         document => l_mail_body,
170         window_size => l_window_size);
171 
172     --IEM_PARSER_PVT.dump_word_vector(l_vec2);
173 
174     --dbms_output.put_line('--- dump_word_vector for Mail Body --------------------------');
175 
176    --l_sim := IEM_PARSER_PVT.compare_vectors(l_vec1, l_vec2);
177 
178    -- filp for wild card compare
179    l_return_vec := IEM_PARSER_PVT.compare_vectors(l_vec2, l_vec1);
180 
181    --IEM_PARSER_PVT.dump_word_vector(l_return_vec);
182 
183    --dbms_output.put_line('------ dump_word_vector for Return_vec in start_parser ---------------');
184 
185     if l_return_vec is null then
186         l_return_vec.extend;
187         l_return_vec(1) := 'Error, no return values.';
188         return l_return_vec;
189     else
190         return l_return_vec;
191     end if;
192      EXCEPTION
193         WHEN OTHERS THEN
194             return l_return_vec;
195 END start_parser;
196 
197 function get_stoplist (idx_name varchar2)
198 return word_vector is
199   retlist word_vector;
200   cursor c1 is
201       select upper(ixv_value) wrd
202       from ctxsys.ctx_index_values
203       where ( ixv_index_name = upper ( idx_name )
204               or ( ixv_index_owner||'.'||ixv_index_name = upper ( idx_name ))
205             )
206       and ixv_attribute = 'STOP_WORD';
207 begin
208   open c1;
209   fetch c1 bulk collect into retlist;
210   return retlist;
211 end;
212 
213 -- init : creates the temporary table for explain if it doesn't exist
214 --        and loads stopword list if not already loaded for this index
215 
216 procedure init (idx_name varchar2) is
217   sl word_vector;
218   username varchar2(30);
219 begin
220 
221   -- calculate the stoplist for this table unless already cached
222 
223   if stoplist.index_name is null or stoplist.index_name <> idx_name then
224     stoplist.index_name := idx_name;
225     stoplist.stopwords  := get_stoplist(idx_name);
226   end if;
227 
228   if ec_initialized <> true then
229 
230     begin
231       execute immediate (
232         'create global temporary table ec_ana_explain '        ||
233         '  ( '                            ||
234         '    explain_id   varchar2(30), ' ||
235         '    id           number, '       ||
236         '    parent_id    number, '       ||
237         '    operation    varchar2(30), ' ||
238         '    options      varchar2(30), ' ||
239         '    object_name  varchar2(64), ' ||
240         '    position     number, '       ||
241         '    cardinality  number '        ||
242         '  )');
243     exception
244       when name_already_used then null;
245     end;
246 
247     begin
248       execute immediate (
249         'create table ec_ana_roots '        ||
250         '  ( '                            ||
251         '    token        varchar2(64), ' ||
252         '    root         varchar2(64) '  ||
253         '  )');
254       execute immediate (
255         'create index ec_ana_roots_index '||
256         '  on ec_ana_roots(token)');
257     exception
258       when name_already_used then null;
259     end;
260 
261     ec_initialized := true;
262 
263   end if;
264 
265 end;
266 
267 function is_a_stopword (wrd varchar2)
268 return boolean is
269 l_count integer :=0;
270 begin
271   for i in 1 .. stoplist.stopwords.count loop
272     if wrd = stoplist.stopwords(i) then
273       return true;
274     end if;
275   end loop;
276   --dbms_output.put_line (' In is_a_stopword, wrd =  '|| wrd);
277   return false;
278 end;
279 
280 procedure dump_word_vector (inlist in word_vector) is
281   i         integer;
282 begin
283   for i in 1 .. inlist.count loop
284      dbms_output.put_line('token: '|| inlist(i));
285   end loop;
286 end;
287 
288 procedure move_item (inlist in out NOCOPY word_vector, frm integer, too integer) is
289   tmp varchar2(2000);
290   i integer;
291 begin
292   tmp := inlist(frm);
293   for i in reverse too+1 .. frm loop
294     inlist(i) := inlist(i-1);
295   end loop;
296   inlist(too) := tmp;
297 end;
298 
299 -- insertion sort of word_vector
300 
301 function sort_list (inlist in word_vector, dedupe boolean)
302 return word_vector is
303   retlist word_vector;
304   m integer;
305   n integer;
306   maxi integer;  -- max offset in array
307 begin
308   retlist := inlist;
309 
310   if retlist.count <= 1 then
311     return retlist;
312   end if;
313 
314   for i in 2 .. retlist.count loop
315     for k in 1 .. i loop
316       if retlist(i) < retlist(k) then
317          move_item(retlist, i, k);
318          exit;
319       end if;
320     end loop;
321   end loop;
322 
323   if dedupe then
324     m := 1;
325     maxi := retlist.count;
326     while m < maxi loop
327       if retlist(m) = retlist(m+1) then
328         for n in m .. maxi-1 loop
329           retlist(n) := retlist(n+1);
330         end loop;
331         retlist.delete(maxi);
332         maxi := maxi - 1;
333       else
334         m := m + 1;
335       end if;
336     end loop;
337   end if;
338 
339   return retlist;
340 end;
341 
342 -- call with indexname and word to get the root of
343 
344 function get_root_no_cache (idx_name varchar2, term varchar2)
345 return varchar2 is
346   exid number;
347   retval varchar2(2000);
348 begin
349 
350 --  dbms_output.put_line('root for '||term);
351   begin
352     ctx_query.explain (
353        index_name => idx_name,
354        text_query => '$'||term,
355        explain_table => 'ec_ana_explain',
356        sharelevel    => 0,    -- do share
357        explain_id    => 1 );
358   exception
359     when imt_error then
360        return term;
361   end;
362   begin
363     execute immediate (
364     ' select object_name from ec_ana_explain ' ||
365     ' where explain_id = 1 '               ||
366     ' and position = 1 '                   ||
367     ' and parent_id = 1' ) into retval;
368   exception when no_data_found then
369     retval := term;
370   end;
371 
372   execute immediate ('truncate table ec_ana_explain');
373 
374 --  dbms_output.put_line('is '||retval);
375   return retval;
376 
377 end;
378 
379 
380 function get_root (idx_name varchar2, term varchar2)
381 return varchar2 is
382   exid number;
383   retval varchar2(2000);
384 
385 begin
386 
387   begin
388     execute immediate (
389       'select root from ec_ana_roots where token = :t1')
390       into retval using term;
391 
392   exception when no_data_found then
393     begin
394       ctx_query.explain (
395          index_name => idx_name,
396          text_query => '$'||term,
397          explain_table => 'ec_ana_explain',
398          sharelevel    => 0,    --  do share
399          explain_id    => 1 );
400     exception
401       when imt_error then
402          execute immediate('insert into ec_ana_roots (token, root) values (:t1, :t2)')
403            using term, retval;
404          return term;
405     end;
406 
407     begin
408       execute immediate (
409       ' select object_name from ec_ana_explain ' ||
410       ' where explain_id = 1 '               ||
411       ' and position = 1 '                   ||
412       ' and parent_id = 1' ) into retval;
413     exception when no_data_found then
414       retval := term;
418     execute immediate('insert into ec_ana_roots (token, root) values (:t1, :t2)')
415     end;
416     execute immediate ('truncate table ec_ana_explain');
417 
419       using term, retval;
420 
421     return retval;
422 
423   end;
424 
425   return retval;
426 
427 end;
428 
429 function normalize_list (idx_name varchar2, inlist in word_vector)
430 return word_vector is
431    retlist  word_vector;
432 begin
433    retlist := word_vector();
434    for i in 1 .. inlist.count loop
435      retlist.extend;
436      if is_a_stopword(inlist(i)) then
437        retlist(i) := inlist(i);
438      else
439        retlist(i) := get_root(idx_name, inlist(i));
440      end if;
441    end loop;
442    return retlist;
443 end;
444 
445 function remove_stopwords (inlist in word_vector)
446 return word_vector is
447    cntr     integer := 1;
448    retlist  word_vector;
449 begin
450    retlist := word_vector();
451    for i in 1 .. inlist.count loop
452      if (not (is_a_stopword(inlist(i))) ) then
453        retlist.extend;
454        retlist(cntr) := inlist(i);
455        cntr := cntr + 1;
456      end if;
457    end loop;
458    return retlist;
459 end;
460 
461 -- get phrases
462 
463 -- from a list of words, produce a list of phrases, defined as any
464 -- contiguous list of non-stopwords, up to max_words in length.
465 
466 function get_phrases (inlist word_vector, max_words integer)
467 return word_vector is
468   p integer;
469   phrase varchar2(2000);
470   phrase1 varchar2(2000);
471   space  varchar2(1);
472   cntr integer := 1;
473   inlistCount integer := 0;
474   mw integer := 0;
475   retlist word_vector;
476   newInlist word_vector;
477 
478 begin
479   inlistCount:= inlist.count;
480   mw := max_words;
481   retlist := word_vector();
482  -- newInlist := word_vector();
483  newInlist := inlist;
484 
485  newInlist := remove_stopwords(newInlist);
486 
487  for i in 1 .. newInlist.count loop
488     --if not is_a_stopword(inlist(i)) then
489        space := '';
490        phrase := '';
491        p := 0;
492        while (p <= mw-1 and i+p <= newInlist.count) loop
493        --if (wildCard) OR (wildFirst)  then -- Check for and remove all wild card character from the phrase
494        --      dbms_output.put_line('in get_phrases  wild = true');
495        --      phrase1 := IEM_PARSER_PVT.remove_wild(newInlist(i+p));
496        --      phrase := phrase || space || phrase1 ;
497        --else
498            phrase := phrase || space || newInlist(i+p);
499        --end if;
500        if p = mw-1 then
501              retlist.extend;
502              retlist(cntr) := phrase;
503              --dbms_output.put_line('phrase: '|| phrase);
504              --dbms_output.put_line('');
505              cntr := cntr + 1;
506          end if;
507          space := ' ';
508          p := p+1;
509        end loop;
510     --end if;
511   end loop;
512   return retlist;
513 end;
514 
515 function parse_string (stringToParse in varchar2)
516 return word_vector is
517    ws       varchar2(32767);        -- workspace
518    theword  varchar2(2000);
519    p        integer;
520    cntr     integer := 1;
521    rettab   word_vector;
522 
523    c        integer := 0;
524 
525 begin
526 
527    rettab := word_vector();  -- initialize
528 
529    ws := stringToParse;
530 
531    -- translate all punctuation into spaces
532 
533   --  ws := translate (ws, '`!"$%^&*()_+{}:@~|<>?-=[];''#\,./',
534   --                      '                                 ');
535   -- Don't remove %,@,.,#,?,'and comma chars, use to denote wild card or email address.
536   -- If changed, be sure to add to the is_num function
537 
538 -- Commited out for R12 patch 4417790 issue
539 
540       --ws := translate (ws, '`!"$^&*()_+{}~|<>-=;\,/',
541                            '                               ');
542 
543    -- and whitespace control chars
544    ws := translate (ws, fnd_global.local_chr(10)||fnd_global.local_chr(11)||fnd_global.local_chr(12)||fnd_global.local_chr(13), '    ');
545 
546    -- upper case it all
547    ws := translate (ws, 'abcdefghijklmnopqrstuvwxyz',
548                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ');
549 
550    -- remove multiple spaces
551    while (instr(ws, '  ', 1) > 0) loop
552      ws := replace (ws, '  ', ' ');
553    end loop;
554 
555    -- and any leading or trailing space
556    if (substr(ws, 1, 1) = ' ') then
557      ws := substr(ws, 2, length(ws)-1);
558    end if;
559 
560    if (substr(ws, length(ws), 1) = ' ') then
561      ws := substr(ws, 1, length(ws)-1);
562    end if;
563 
564    theword := substr(ws, 280, 40);
565 
566    p := instr(ws, ' ');
567 
568    while p > 0 loop
569 
570       c := c+1;
571 
572       theword := substr(ws, 1, p-1);
573 
574       -- save word. Simply discard it if too long
575       if length(theword) <= MAXWORDLENGTH then
576         rettab.extend;
577         rettab(cntr) := theword;
578         cntr := cntr + 1;
579       end if;
580 
581       ws := substr(ws, p+1, length(ws)-p);
582       p := instr(ws, ' ');
583 
584    end loop;
585 
586    theword := ws;
587    if length(theword) > 0 then
588      rettab.extend;
589      rettab(cntr) := theword;
590    end if;
591 
592    return rettab;
593 end;
594 
595 -- compute the word vector
599   document         in varchar2,
596 
597 function compute_vector (
598   idx_name         in varchar2, --DEFAULT fnd_api.g_miss_char,
600   analyze_length   in integer default 5000,
601   window_size      in integer default 3
602   )
603 return word_vector is
604   maxlen   integer;
605   ws       integer;
606   thedoc   varchar2(32767);
607   retlist  word_vector;
608   t1 number; t2 number; t3 number; t4 number; t5 number;
609 begin
610 
611   if analyze_length is null then
612     maxlen := 5000;
613   else
614     maxlen := analyze_length;
615   end if;
616 
617   maxlen := least(maxlen, 32767);
618 
619   if window_size is null then
620     ws := 3;
621   else
622     ws := window_size;
623   end if;
624 
625   if length(document) > maxlen then
626     thedoc := substr(document, 1, maxlen);
627   else
628     thedoc := document;
629   end if;
630 
631   init(idx_name);
632 
633 --t1 := dbms_utility.get_time;
634 
635   retlist := parse_string(thedoc);
636   --dbms_output.put_line ('This is retlist after parse_string :'||  retlist(1));
637 
638 --t2 := dbms_utility.get_time;
639 --dbms_output.put_line('parse     ' || to_char((t2-t1)/100));
640 
641   --retlist := normalize_list ( idx_name, retlist );
642 
643 --t3 := dbms_utility.get_time;
644 -- dbms_output.put_line('Normalise ' || to_char((t3-t2)/100));
645 
646   --dbms_output.put_line ('Just before get_phrases');
647   --dbms_output.put_line ('This is retlist.count :'||  retlist.count);
648 
649   retlist := get_phrases    ( retlist, ws );
650 
651   --dbms_output.put_line ('Just after get_phrases');
652   --dbms_output.put_line ('This is retlist.count :'||  retlist.count);
653 
654 --t4 := dbms_utility.get_time;
655 --dbms_output.put_line('phrase    ' || to_char((t4-t3)/100));
656 
657   -- retlist := sort_list      ( retlist, true );
658   retlist := sort_list      ( retlist, false ); -- Don't remove duplicates
659 
660  --dbms_output.put_line ('This is retlist after sort :'||  retlist);
661  --for i in 1 .. retlist.count loop
662      --dbms_output.put_line ('This is retlist after sort :'||  retlist(i));
663  --end loop;
664 
665 --t5 := dbms_utility.get_time;
666 --dbms_output.put_line('Normalise ' || to_char((t3-t2)/100)||' sort ' || to_char((t5-t4)/100));
667 
668   return retlist;
669 end;
670 
671 -- same function for clobs
672 
673 function compute_vector (
674   idx_name         in varchar2,
675   document         in clob,
676   analyze_length   in integer default 5000,
677   window_size      in integer default 3
678   )
679 return word_vector is
680   retlist   word_vector;
681   maxlen    integer;
682   thestrng  varchar2(32767);
683   theoffset integer;
684   thesize   integer;
685   thebuff   varchar2(32767);
686 begin
687 
688   if analyze_length is null then
689     maxlen := 5000;
690   else
691     maxlen := analyze_length;
692   end if;
693 
694   maxlen := least(maxlen, 32767);
695 
696   theoffset := 1;
697   thestrng  := ' ';
698   while (true) loop
699 
700     begin
701       thesize := maxlen;
702       dbms_lob.read(
703         lob_loc    => document,
704         amount     => thesize,
705         offset     => theoffset,
706         buffer     => thebuff );
707     exception
708      when no_data_found then
709       exit;
710     end;
711 
712     exit when (thesize <= 0);
713     theoffset := thesize+1;
714 
715     if length(thestrng) + thesize < maxlen then
716       thestrng := thestrng || thebuff;
717     else
718       thestrng := thestrng || substr(thebuff, 1, maxlen-length(thestrng)-1);
719       exit;
720     end if;
721 
722   end loop;
723 
724   if length(thestrng) + thesize < maxlen then
725     thestrng := thestrng || thebuff;
726   else
727     thestrng := thestrng || substr(thebuff, 1, maxlen-length(thestrng)-1);
728   end if;
729 
730   retlist := compute_vector (idx_name, thestrng, maxlen, window_size);
731 
732   return retlist;
733 end;
734 
735 -- compare two lists of phrases
736 -- resulting similarity quotient is based on number of shared phrases,
737 -- reduced according to length of each vector.
738 
739 function compare_vectors (inlist1 in word_vector, inlist2 in word_vector)
740 return  word_vector is
741   result_array 		word_vector;
742   matchcnt 		integer := 0;
743   j  			integer := 1;
744   k  			integer := 1;
745   l_Error_Message	varchar2(200);
746   INVALID_WORD_VECTOR	EXCEPTION;
747 begin
748 
749   result_array := word_vector();  -- initialize
750 
751 
752   if inlist1 is null or inlist2 is null then
753      RAISE INVALID_WORD_VECTOR;
754   end if;
755 
756   if inlist1.count = 0 or inlist2.count = 0 then
757      RAISE INVALID_WORD_VECTOR;
758   end if;
759 
760 
761    while (j <= inlist1.count and k <= inlist2.count) loop
762      while (k <= inlist2.count and j <= inlist1.count) loop
763 
764       -- Don't count the wild # as a direct match in string when it is not quoted
765       if (inlist1(j) = inlist2(k)) and not (instr(inlist2(k), '#') > 0)  then
766           --dbms_output.put_line('***** match on in compare vector ***** '|| inlist1(j));
767           matchcnt := matchcnt + 1;
768           --dbms_output.put_line('***** matchcnt ***** '|| matchcnt);
769           result_array.extend;
770           result_array(matchcnt) :=  inlist1(j);
774       elsif  (instr(inlist2(k), '%') > 0) then
771           --dbms_output.put_line('***** result_array(matchcnt) ***** '|| result_array(matchcnt));
772           j := j+1;
773           exit;
775           --dbms_output.put_line('---------------------------');
776           --dbms_output.put_line('---------------------------');
777           --dbms_output.put_line('At wild % in compare_vec');
778           --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
779           --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
780           if (test_match(inlist2(k), inlist1(j))) then
781               --dbms_output.put_line('match on %  in compare_vectors '|| inlist1(j));
782               matchcnt := matchcnt + 1; -- update match count and move on
783               result_array.extend;
784               result_array(matchcnt) :=  inlist1(j);
785               j := j+1;
786               exit;
787           else -- just move to the next word
788               j := j+1;
789               exit;
790           end if;
791       elsif  (instr(inlist2(k), '#') > 0) then
792           --dbms_output.put_line('---------------------------');
793           --dbms_output.put_line('---------------------------');
794           --dbms_output.put_line('At wild # in compare_vec');
795           --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
796           --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
797           if (test_match_single(inlist2(k), inlist1(j))) then
798               --dbms_output.put_line('match on is_num in compare_vectors '|| inlist1(j));
799               matchcnt := matchcnt + 1; -- update match count and move on
800               result_array.extend;
801               result_array(matchcnt) :=  inlist1(j);
802               j := j+1;
803               exit;
804           else -- just move to the next word
805               j := j+1;
806               exit;
807           end if;
808       elsif  (instr(inlist2(k), '?') > 0) then
809           --dbms_output.put_line('---------------------------');
810           --dbms_output.put_line('---------------------------');
811           --dbms_output.put_line('At wild ? in compare_vec');
812           --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
813           --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
814           if (test_match_single(inlist2(k), inlist1(j))) then
815               --dbms_output.put_line('match on NOT is_num in compare_vectors '|| inlist1(j));
816               matchcnt := matchcnt + 1; -- update match count and move on
817               result_array.extend;
818               result_array(matchcnt) :=  inlist1(j);
819               j := j+1;
820               exit;
821           else -- just move to the next word
822               j := j+1;
823               exit;
824           end if;
825       elsif (inlist2(k) > inlist1(j)) or (wildCard) then
826         j := j+1;
827         --dbms_output.put_line('j+1 is  at k > j: '|| j);
828         --dbms_output.put_line('This is inlist1(j) at k > j: ' || inlist1(j));
829         --dbms_output.put_line('This is inlist2(k) at k > j: ' || inlist2(k));
830         exit;
831       end if;
832       k := k+1;
833     end loop;
834   end loop;
835   --dbms_output.put_line('Matches   : '||to_char(matchcnt));
836   --dbms_output.put_line('Word Count: '||to_char(inlist1.count+inlist2.count));
837   --dbms_output.put_line('Mail body word count: '||to_char(inlist1.count));
838   --dbms_output.put_line('sqrt      : '||to_char(sqrt(sqrt(inlist1.count+inlist2.count))));
839   --dbms_output.put_line(inlist1.count+inlist2.count);
840  wildCard := false; -- reset global variable
841       --dbms_output.put_line (' --------- wildCard set to false ----------- ');
842   --return (matchcnt*1.0)/sqrt(sqrt((inlist1.count+inlist2.count)))*100;
843   --return ((matchcnt*1.0)/(inlist1.count))*100;
844   --return ((matchcnt*1.0)/(inlist2.count))*100;
845     return result_array;
846     EXCEPTION
847         WHEN INVALID_WORD_VECTOR THEN
848         FND_MESSAGE.SET_NAME('IEM','IEM_INVALID_WORD_VECTOR');
849         l_Error_Message := FND_MESSAGE.GET;
850         fnd_file.put_line(fnd_file.log, l_Error_Message);
851 
852 end;
853 
854 procedure p_themes (
855    index_name      in  varchar2,
856 --   textkey         in  varchar2,
857 --   restab          in out  ctx_doc.theme_tab,
858    restab       in out NOCOPY iem_im_wrappers_pvt.theme_table,
859    full_themes     in  boolean default true,
860    num_themes      in  number default 32,
861    document        in  varchar2,
862    pictograph      in boolean default false
863    )
864 is
865 --  thedoc           clob;
866   wv               word_vector;
867 begin
868 
869 
870 --  ctx_doc.filter(
871 --      index_name => index_name,
872 --      textkey    => textkey,
873 --      restab     => thedoc,
874 --      plaintext  => true );
875 
876   wv := IEM_PARSER_PVT.compute_vector(
877 --      idx_name       => 'acr_resp_index',
878 idx_name       => index_name,
879       document       => document,
880       analyze_length => DOCLENGTH,
881       window_size    => WINDOWSIZE );
882 
883   for i in 1 .. wv.count loop
884     IF wv(i) <>' ' then
885     	restab(i).theme  := wv(i);
886     	restab(i).weight := 1;
887     END IF;
888   end loop;
889 
890 end;
891 
892 function get_window_size (search_string varchar2)
893 return integer is
894 
895 INVALID_LOB              EXCEPTION;
896 ws       varchar2(32767);        -- workspace
897 theword  varchar2(2000);
898 p        integer;
899 cntr     integer := 1;
900 rettab   word_vector;
901 window_size integer := 0;
902 
903 BEGIN
904 
905 --  Find the window size/number of words in the search phrase
906 
910      ws := replace (ws, '  ', ' ');
907    ws := search_string;
908 -- remove multiple spaces
909    while (instr(ws, '  ', 1) > 0) loop
911    end loop;
912 
913    -- and any leading or trailing space
914    if (substr(ws, 1, 1) = ' ') then
915      ws := substr(ws, 2, length(ws)-1);
916    end if;
917 
918    if (substr(ws, length(ws), 1) = ' ') then
919      ws := substr(ws, 1, length(ws)-1);
920    end if;
921 
922    theword := substr(ws, 280, 40);
923 
924    p := instr(ws, ' ');
925 
926    while p > 0 loop
927     window_size := window_size+1;
928 
929       ws := substr(ws, p+1, length(ws)-p);
930 
931       p := instr(ws, ' ');
932 
933    end loop;
934    window_size := window_size+1;  -- Final window/word count
935    return window_size;
936 END; -- get_window_size
937 
938 function remove_wild (mask varchar2)
939 return varchar2 is
940 
941 newMask  varchar2(2000);
942 l_count     integer := 0;
943 maskLength integer := 0;
944 
945 BEGIN
946     l_count := instr(mask, '%');
947     maskLength := length(mask);
948     if l_count > 1 then -- % is at the end of the word
949         newMask  := substr(mask, 1, maskLength -1); -- Remove % form the wnd of the mask
950         return newMask;
951     elsif l_count = 1 then -- % is at the begining of the word
952         newMask  := substr(mask, 2, maskLength -1);
953         return newMask;
954    end if;
955    return mask;
956 END;
957 
958 function test_match (mask varchar2, testToken varchar2)
959 return boolean is
960 
961 maskLength integer := 0;
962 testTokenLength integer := 0;
963 
964  matchcnt integer := 0;
965   m  integer := 1;
966   n  integer := 1;
967   literalChar boolean := false;
968 
969 BEGIN
970 
971   maskLength := length(mask);
972   testTokenLength := length(testToken);
973 
974   if mask is null or testToken is null then
975     return false;
976   end if;
977 
978   if maskLength = 0 or testTokenLength = 0 then
979     return false;
980   end if;
981 
982   while (m <= maskLength and n <= testTokenLength) loop
983       --dbms_output.put_line('-------------------------------------');
984       --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
985       --dbms_output.put_line('substr(testToken,n,1) '|| substr(testToken,n,1));
986 
987       if (substr(mask,m,1) = fnd_global.local_chr(39)) then
988           --dbms_output.put_line('At chr(39)');
989           m := m+1; -- move to the next character in the mask
990           --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
991           --dbms_output.put_line('=-=-=-=-=-=-=-=-=-=-=-=-=-=');
992           matchcnt := matchcnt + 1; --  count this as a match.
993           if  (literalChar = false) then
994               literalChar := true; --flip the literal character flag
995           else
996               literalChar := false;
997          end if;
998       elsif (substr(mask,m,1) = '#' AND NOT literalChar) then -- We need to see if the current testToken char is anumber
999           if (is_num(substr(testToken,n,1))) then
1000               m := m+1; -- move to the next character in the mask
1001               matchcnt := matchcnt + 1; --  this is a match.
1002               n := n+1; -- move to the next testToken char
1003               --dbms_output.put_line('(substr(mask,m,1) TRUE '|| substr(mask,m,1));
1004           else
1005               --dbms_output.put_line('substr(mask,m,1) FALSE '|| substr(mask,m,1));
1006               return false; -- This is not a match for this token
1007           end if;
1008       elsif (substr(mask,m,1) = '?' AND NOT literalChar) then -- skip this in the mask and count it as a
1009           if (NOT(is_num(substr(testToken,n,1)))) then
1010               m := m+1; -- move to the next character in the mask
1011               matchcnt := matchcnt + 1; --  this is a match
1012               n := n+1; -- move to the next testToken char
1013           else
1014               return false; -- This is not a match for this token
1015           end if;
1016       elsif (substr(mask,m,1) = '%' AND NOT literalChar) then -- test if testToken has a match for then next mask character
1017                                      -- anywhere in it's string.
1018           m := m+1; -- move to the next character in the mask
1019           matchcnt := matchcnt + 1; -- have the wild card march.
1020           n := n+1; -- The % will match any single character in the testToken string
1021 
1022           while (m <= maskLength and n <= testTokenLength) loop
1023               if substr(mask,m,1) = substr(testToken,n,1) then
1024                   --dbms_output.put_line('match on new and improved Wild '|| substr(testToken,n,1));
1025                   matchcnt := matchcnt + 1;
1026                   m := m+1;
1027                   n := n+1;
1028               else
1029                   -- Need to take into account the single quote
1030                   if (substr(mask,m,1) = '%' OR substr(mask,m,1) = '#' OR  substr(mask,m,1) = '?' OR substr(mask,m,1) = fnd_global.local_chr(39)) then
1031                       exit; -- leave inner loop and continue processing strings
1032                   else
1033                       n :=n+1;  -- chars don't match so move n along
1034                  end if;
1035               end if;
1036           end loop;
1037       elsif substr(mask,m,1) = substr(testToken,n,1) then
1038           --dbms_output.put_line('match on from new test_match: '|| substr(testToken,n,1));
1039           matchcnt := matchcnt + 1; -- move on in mask and testToken
1040           m := m+1;
1041           n := n+1;
1042       else  -- not a match so return false
1043            --dbms_output.put_line('At last else -- returnig false at New test_match');
1044           return false;
1045       end if;
1049 
1046     end loop;
1047 
1048     --dbms_output.put_line('matchLength and matchcnt '|| maskLength ||' '|| matchcnt);
1050     if maskLength = matchcnt then
1051         return true;
1052     else
1053         return false;
1054     end if;
1055 END; -- test_match
1056 
1057 function test_match_single (mask varchar2, testToken varchar2)
1058 return boolean is
1059 
1060 maskLength integer := 0;
1061 testTokenLength integer := 0;
1062 
1063  matchcnt integer := 0;
1064   m  integer := 1;
1065   n  integer := 1;
1066   literalChar boolean := false;
1067 
1068 BEGIN
1069 
1070   maskLength := length(mask);
1071   testTokenLength := length(testToken);
1072 
1073   if mask is null or testToken is null then
1074     return false;
1075   end if;
1076 
1077   if maskLength = 0 or testTokenLength = 0 then
1078     return false;
1079   end if;
1080 
1081   while (m <= maskLength and n <= testTokenLength) loop
1082       --dbms_output.put_line('-------------------------------------');
1083       --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
1084       --dbms_output.put_line('substr(testToken,n,1) '|| substr(testToken,n,1));
1085 
1086       if (substr(mask,m,1) = fnd_global.local_chr(39)) then
1087           --dbms_output.put_line('At chr(39)');
1088           m := m+1; -- move to the next character in the mask
1089           --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
1090           --dbms_output.put_line('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<');
1091           if  (literalChar = false) then
1092               literalChar := true; --flip the literal character flag
1093           else
1094               literalChar := false;
1095          end if;
1096 
1097       elsif( substr(mask,m,1) = '#'  AND NOT literalChar) then -- test if testToken has an number in it's corresponding position
1098             if (is_num(substr(testToken,n,1))) then
1099                --dbms_output.put_line('match on from is_num: TRUE '|| substr(testToken,n,1));
1100                matchcnt := matchcnt + 1;
1101                m := m+1;
1102                n := n+1;
1103             else -- testToken position does not contain an number
1104                 --dbms_output.put_line('match on from is_num: FALSE '|| substr(testToken,n,1));
1105                 return false;
1106             end if;
1107       elsif (substr(mask,m,1) = '?'  AND NOT literalChar)then -- test if testToken has an number in it's corresponding position
1108             if (not(is_num(substr(testToken,n,1)))) then
1109                --dbms_output.put_line('match on NOT num from is_num: '|| substr(testToken,n,1));
1110                matchcnt := matchcnt + 1;
1111                m := m+1;
1112                n := n+1;
1113             else -- testToken position does not contain an number
1114                 return false;
1115             end if;
1116       elsif substr(mask,m,1) = substr(testToken,n,1) then
1117           --dbms_output.put_line('match on from test_match_single: '|| substr(testToken,n,1));
1118           matchcnt := matchcnt + 1; -- move on in mask and testToken
1119           m := m+1;
1120           n := n+1;
1121       else  -- not a match so return false
1122            --dbms_output.put_line('At last else -- returnig false at test_mask_single');
1123           return false;
1124       end if;
1125     end loop;
1126 
1127     --dbms_output.put_line('matchLength and matchcnt '|| maskLength ||' '|| matchcnt);
1128 
1129     if testTokenLength = matchcnt then
1130         return true;
1131     else
1132         return false;
1133     end if;
1134 END; -- test_match_single
1135 
1136 Function is_num (s1 varchar2)
1137   RETURN  boolean IS
1138 BEGIN
1139     IF
1140 INSTR(TRANSLATE(upper(s1),'ABCDEFGHIJKLMNOPQRSTUVWXYZ%@.#?,','$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'),'$') = 0 THEN
1141        RETURN true; -- No characters where translated so s1 is a number
1142     ELSE
1143        RETURN false; -- Characters were translated to $ so s1 was not a number`
1144     END IF;
1145 END; -- is_num
1146 
1147 procedure test is
1148   vec1 		word_vector;
1149   vec2 		word_vector;
1150   result_vec	word_vector;
1151   doc  		varchar2(4000);
1152 begin
1153 
1154   doc := 'the QUICK brown Fox because JUMPS o''er the LAZy DOG. The Dogs are LAUGHING to see such foxes AND the
1155 fox-cow JUMPED over the moon because he could';
1156 -- fails
1157 --  doc := 'the QUICK brown Fox because JUMPS over the LAZy DOG The Dogs';
1158 -- ok
1159 --  doc := 'aa BB cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx yy zz';
1160 --??
1161     doc := 'the QUICK brown Fox because JUMPS
1162 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa over the LAZy DOG the p qq rr
1163 ss tt uu vv ww xx yy zz';
1164 
1165   vec1 := IEM_PARSER_PVT.compute_vector(
1166     idx_name => 'explain_ex_text',
1167     document => doc,
1168     window_size => 3);
1169 
1170 --dbms_output.put_line('----------');
1171 
1172   vec2 := IEM_PARSER_PVT.compute_vector(
1173     idx_name => 'explain_ex_text',
1174     document => 'how now red fox. The quick quacking quick brown fox is a lazy dog',
1175     window_size => 3);
1176 
1177   result_vec := IEM_PARSER_PVT.compare_vectors(vec1, vec2);
1178 
1179   --dbms_output.put_line('Similarity %age is : ' || to_char(round((result_vec.count), 2)));
1180 
1181 end;
1182 
1183 */
1184 END IEM_PARSER_PVT;