eTRM uses javascript that opens and focuses a separate window to display reports. Your browser does not appear to support javascript. This should not prevent you from using eTRM but it does mean the window focus feature will not work. Please check the contents of other windows on your desktop if you click on a link that does not appear to do anything.
[Home] [Help]
PACKAGE BODY: APPS.IEM_PARSER_PVT

Source


1 PACKAGE BODY IEM_PARSER_PVT as
2 /* $Header: iemparbb.pls 120.3 2005/10/03 15:22:51 appldev noship $*/
3 
4 -- TODO:
5 -- add thesaurus processing?
6 
7 -- set serverout on size 100000
8 
9 -----------------------------------------------------------------
10 -- NOTE: CTXSYS must grant user SELECT on CTX_INDEX_VALUES
11 --       ALSO MUST EXPLICITLY GRANT USER 'CREATE TABLE'
12 -----------------------------------------------------------------
13 
14 -- Analyze documents with a view to comparing them
15 -- function COMPUTE_VECTOR calculates the complete set of "phrases"
16 -- for a document, where a phrase is defined a contiguous set of non-stopwords
17 -- of maximum length "window_size" after all punctuation has been removed
18 -- and each word has been "normalized" (reduced to a linguistic stem).
19 
20 -- consider the document:
21 --  'the quick brown fox jumps over the lazy dog'
22 -- where 'the' and 'over' are stopwords.
23 -- if we set our window size to be 2, then the vector will consist of
24 --  "quick" "quick brown" "brown" "brown fox" "fox" "fox jump" "jump"
25 --  "lazy" "lazy dog"
26 -- if window size is set to 3, then we would get
27 --  "quick" "quick brown" "quick brown fox" "brown" "brown fox" "brown fox jump"
28 -- etc.
29 
30 -- "word_vectors" are sorted and duplicates removed.
31 -- they may then be compared using the COMPARE_VECTOR function. This counts
32 -- the number of matching phrases in each vector, and divides them by the total
33 -- number of words to give a percentage match. Identical documents will always
34 -- score 100%.  As many words will NOT be a match, a score of 10% or greater
35 -- generally represents an excellent match.
36 
37 -- the procedure TEST may be used to experiment with scores.
38 
39 -- the procedure P_THEMES may be used in place of a call to CTX_DOC.THEMES,
40 -- as it uses the same interface. Note, however, that many more terms will be
41 -- returned in the theme_table than the typical 16 or 32 returned by
42 -- CTX_DOC.THEMES.  If the output of this function is used in a CONTAINS query,
43 -- then each term should be prefixed by the $ stemming operator - ie. use
44 --   $(term1), $(term2) instead of ABOUT(term1), ABOUT(term2)
45 
46 -- Package variables
47 /*
48 ec_initialized boolean := false;
49 wildCard boolean := false;
50 wildFirst boolean := false;
51 
52 type stoplist_type is record (
53   index_name  varchar2(30),
54   stopwords   word_vector );
55 
56 -- stoplist changes each time we process a new index, but is cached between
57 stoplist stoplist_type;
58 
59 name_already_used exception;
60 PRAGMA EXCEPTION_INIT(name_already_used, -955);
61 
62 imt_error exception;
63 PRAGMA EXCEPTION_INIT(imt_error, -20000);
64 
65 -----------------------------------------------------------------
66 -- The following procedure is the exposed API for the
67 -- Email body parser
68 --
69 -- Input paramters are:
70 -- 	p_message_id      number(15)		-- Required: Message Id from iem_ms_msgbodys;
71 --	p_search_str      varchar2(4000)	-- Required: Input email body search string
72 --	p_idx_name        varchar2		-- Optional Oracle Text Index name,
73 --	p_analyze_length  integer		-- default 4000,
74 --
75 -- TO DO
76 -- Maxlen should be max of 4000 ?
77 -----------------------------------------------------------------
78 function start_parser (
79     p_message_id	number,
80     p_search_str 	varchar2,
81     p_idx_name         	varchar2,
82     p_analyze_length   	integer
83     ) return word_vector is
84 
85     l_window_size 	integer := 0;
86     l_mail_body       	varchar2(4000) := null;
87     l_error_message 	varchar2(200);
88     l_count 		integer := 0;
89     l_maskLength 	integer := 0;
90     l_maxlen   		integer := 0;
91     l_idx_name          varchar2(50);
92     l_search_str        varchar2(4000);
93     l_vec1  		word_vector;
94     l_vec2  		word_vector;
95     l_return_vec	word_vector;
96     INVALID_MAIL_BODY  	EXCEPTION;
97     INVALID_MESSAGE_ID 	EXCEPTION;
98     INVALID_SEARCH_STR 	EXCEPTION;
99     cursor getmsgbody IS
100         SELECT VALUE from IEM_MS_MSGBODYS where p_message_id = message_id;
101 
102 BEGIN
103     l_maskLength := length(p_search_str);
104     l_return_vec := word_vector();  -- initialize
105     --dbms_output.put_line('p_message_id: ' || p_message_id);
106     --dbms_output.put_line('p_search_str: ' || p_search_str);
107     --dbms_output.put_line('p_idx_name: ' ||  p_idx_name);
108     --dbms_output.put_line('p_analyze_length: ' ||  p_analyze_length);
109     --dbms_output.put_line('-- Passed in values --------');
110 
111     -- Check for null values in passed parameters
112     if p_message_id is null then
113         RAISE INVALID_MESSAGE_ID;
114     end if;
115     if p_search_str is null then
116         RAISE INVALID_SEARCH_STR;
117     end if;
118     l_idx_name := p_idx_name;
119     if l_idx_name is null then
120        l_idx_name := 'emc_idx';
121     end if;
122     if p_analyze_length is null then
123         l_maxlen := 4000;
124     else
125         l_maxlen := p_analyze_length;
126     end if;
127 
128     l_maxlen := least(l_maxlen, 32767);
129     l_search_str := p_search_str;
130     if length(l_search_str) > l_maxlen then
131         l_search_str := substr(l_search_str, 1, l_maxlen);
132     end if;
133 
134     l_window_size := IEM_PARSER_PVT.get_window_size(l_search_str);
135 
136     l_count := instr(l_search_str,'%');
137     if l_count > 0 then
138         if  l_count = 1 then
139             wildFirst := true;
140         elsif l_count = l_maskLength then
141             wildCard := true;
142         end if;
143     end if;
144 
145      -- Get message body from IEM_MS_MSGBODYS
146         open getmsgbody;
147 	fetch getmsgbody INTO l_mail_body;
148             if l_mail_body is null then
149         	RAISE INVALID_MAIL_BODY;
150             end if;
151     --dbms_output.put_line('l_mail_body: ' ||  l_mail_body);
152     --dbms_output.put_line('-- l_mail_body --------');
153 
154     --dbms_output.put_line('l_search_str: ' ||  l_search_str);
155     --dbms_output.put_line('-- Pass l_search_str to compute_vector --------');
156 
157     -- Compute Vector for search string
158     l_vec1 := IEM_PARSER_PVT.compute_vector(
159         idx_name => l_idx_name,
160         document => l_search_str,
161         window_size => l_window_size);
162 
163     --IEM_PARSER_PVT.dump_word_vector(l_vec1);
164     --dbms_output.put_line('--  dump_word_vector for l_search_str --------');
165 
166     -- Compute Vector for message body
167     l_vec2 := IEM_PARSER_PVT.compute_vector(
168         idx_name => l_idx_name,
169         document => l_mail_body,
170         window_size => l_window_size);
171 
172     --IEM_PARSER_PVT.dump_word_vector(l_vec2);
173 
174     --dbms_output.put_line('--- dump_word_vector for Mail Body --------------------------');
175 
176    --l_sim := IEM_PARSER_PVT.compare_vectors(l_vec1, l_vec2);
177 
178    -- filp for wild card compare
179    l_return_vec := IEM_PARSER_PVT.compare_vectors(l_vec2, l_vec1);
180 
181    --IEM_PARSER_PVT.dump_word_vector(l_return_vec);
182 
183    --dbms_output.put_line('------ dump_word_vector for Return_vec in start_parser ---------------');
184 
185     if l_return_vec is null then
186         l_return_vec.extend;
187         l_return_vec(1) := 'Error, no return values.';
188         return l_return_vec;
189     else
190         return l_return_vec;
191     end if;
192      EXCEPTION
193         WHEN OTHERS THEN
194             return l_return_vec;
195 END start_parser;
196 
197 function get_stoplist (idx_name varchar2)
198 return word_vector is
199   retlist word_vector;
200   cursor c1 is
201       select upper(ixv_value) wrd
202       from ctxsys.ctx_index_values
203       where ( ixv_index_name = upper ( idx_name )
204               or ( ixv_index_owner||'.'||ixv_index_name = upper ( idx_name ))
205             )
206       and ixv_attribute = 'STOP_WORD';
207 begin
208   open c1;
209   fetch c1 bulk collect into retlist;
210   return retlist;
211 end;
212 
213 -- init : creates the temporary table for explain if it doesn't exist
214 --        and loads stopword list if not already loaded for this index
215 
216 procedure init (idx_name varchar2) is
217   sl word_vector;
218   username varchar2(30);
219 begin
220 
221   -- calculate the stoplist for this table unless already cached
222 
223   if stoplist.index_name is null or stoplist.index_name <> idx_name then
224     stoplist.index_name := idx_name;
225     stoplist.stopwords  := get_stoplist(idx_name);
226   end if;
227 
228   if ec_initialized <> true then
229 
230     begin
231       execute immediate (
232         'create global temporary table ec_ana_explain '        ||
233         '  ( '                            ||
234         '    explain_id   varchar2(30), ' ||
235         '    id           number, '       ||
236         '    parent_id    number, '       ||
237         '    operation    varchar2(30), ' ||
238         '    options      varchar2(30), ' ||
239         '    object_name  varchar2(64), ' ||
240         '    position     number, '       ||
241         '    cardinality  number '        ||
242         '  )');
243     exception
244       when name_already_used then null;
245     end;
246 
247     begin
248       execute immediate (
249         'create table ec_ana_roots '        ||
250         '  ( '                            ||
251         '    token        varchar2(64), ' ||
252         '    root         varchar2(64) '  ||
253         '  )');
254       execute immediate (
255         'create index ec_ana_roots_index '||
256         '  on ec_ana_roots(token)');
257     exception
258       when name_already_used then null;
259     end;
260 
261     ec_initialized := true;
262 
263   end if;
264 
265 end;
266 
267 function is_a_stopword (wrd varchar2)
268 return boolean is
269 l_count integer :=0;
270 begin
271   for i in 1 .. stoplist.stopwords.count loop
272     if wrd = stoplist.stopwords(i) then
273       return true;
274     end if;
275   end loop;
276   --dbms_output.put_line (' In is_a_stopword, wrd =  '|| wrd);
277   return false;
278 end;
279 
280 procedure dump_word_vector (inlist in word_vector) is
281   i         integer;
282 begin
283   for i in 1 .. inlist.count loop
284      dbms_output.put_line('token: '|| inlist(i));
285   end loop;
286 end;
287 
288 procedure move_item (inlist in out NOCOPY word_vector, frm integer, too integer) is
289   tmp varchar2(2000);
290   i integer;
291 begin
292   tmp := inlist(frm);
293   for i in reverse too+1 .. frm loop
294     inlist(i) := inlist(i-1);
295   end loop;
296   inlist(too) := tmp;
297 end;
298 
299 -- insertion sort of word_vector
300 
301 function sort_list (inlist in word_vector, dedupe boolean)
302 return word_vector is
303   retlist word_vector;
304   m integer;
305   n integer;
306   maxi integer;  -- max offset in array
307 begin
308   retlist := inlist;
309 
310   if retlist.count <= 1 then
311     return retlist;
312   end if;
313 
314   for i in 2 .. retlist.count loop
315     for k in 1 .. i loop
316       if retlist(i) < retlist(k) then
317          move_item(retlist, i, k);
318          exit;
319       end if;
320     end loop;
321   end loop;
322 
323   if dedupe then
324     m := 1;
325     maxi := retlist.count;
326     while m < maxi loop
327       if retlist(m) = retlist(m+1) then
328         for n in m .. maxi-1 loop
329           retlist(n) := retlist(n+1);
330         end loop;
331         retlist.delete(maxi);
332         maxi := maxi - 1;
333       else
334         m := m + 1;
335       end if;
336     end loop;
337   end if;
338 
339   return retlist;
340 end;
341 
342 -- call with indexname and word to get the root of
343 
344 function get_root_no_cache (idx_name varchar2, term varchar2)
345 return varchar2 is
346   exid number;
347   retval varchar2(2000);
348 begin
349 
350 --  dbms_output.put_line('root for '||term);
351   begin
352     ctx_query.explain (
353        index_name => idx_name,
354        text_query => '$'||term,
355        explain_table => 'ec_ana_explain',
356        sharelevel    => 0,    -- do share
357        explain_id    => 1 );
358   exception
359     when imt_error then
360        return term;
361   end;
362   begin
363     execute immediate (
364     ' select object_name from ec_ana_explain ' ||
365     ' where explain_id = 1 '               ||
366     ' and position = 1 '                   ||
367     ' and parent_id = 1' ) into retval;
368   exception when no_data_found then
369     retval := term;
370   end;
371 
372   execute immediate ('truncate table ec_ana_explain');
373 
374 --  dbms_output.put_line('is '||retval);
375   return retval;
376 
377 end;
378 
379 
380 function get_root (idx_name varchar2, term varchar2)
381 return varchar2 is
382   exid number;
383   retval varchar2(2000);
384 
385 begin
386 
387   begin
388     execute immediate (
389       'select root from ec_ana_roots where token = :t1')
390       into retval using term;
391 
392   exception when no_data_found then
393     begin
394       ctx_query.explain (
395          index_name => idx_name,
396          text_query => '$'||term,
397          explain_table => 'ec_ana_explain',
398          sharelevel    => 0,    --  do share
399          explain_id    => 1 );
400     exception
401       when imt_error then
402          execute immediate('insert into ec_ana_roots (token, root) values (:t1, :t2)')
403            using term, retval;
404          return term;
405     end;
406 
407     begin
408       execute immediate (
409       ' select object_name from ec_ana_explain ' ||
410       ' where explain_id = 1 '               ||
411       ' and position = 1 '                   ||
412       ' and parent_id = 1' ) into retval;
413     exception when no_data_found then
414       retval := term;
418     execute immediate('insert into ec_ana_roots (token, root) values (:t1, :t2)')
415     end;
416     execute immediate ('truncate table ec_ana_explain');
417 
419       using term, retval;
420 
421     return retval;
422 
423   end;
424 
425   return retval;
426 
427 end;
428 
429 function normalize_list (idx_name varchar2, inlist in word_vector)
430 return word_vector is
431    retlist  word_vector;
432 begin
433    retlist := word_vector();
434    for i in 1 .. inlist.count loop
435      retlist.extend;
436      if is_a_stopword(inlist(i)) then
437        retlist(i) := inlist(i);
438      else
439        retlist(i) := get_root(idx_name, inlist(i));
440      end if;
441    end loop;
442    return retlist;
443 end;
444 
445 function remove_stopwords (inlist in word_vector)
446 return word_vector is
447    cntr     integer := 1;
448    retlist  word_vector;
449 begin
450    retlist := word_vector();
451    for i in 1 .. inlist.count loop
452      if (not (is_a_stopword(inlist(i))) ) then
453        retlist.extend;
454        retlist(cntr) := inlist(i);
455        cntr := cntr + 1;
456      end if;
457    end loop;
458    return retlist;
459 end;
460 
461 -- get phrases
462 
463 -- from a list of words, produce a list of phrases, defined as any
464 -- contiguous list of non-stopwords, up to max_words in length.
465 
466 function get_phrases (inlist word_vector, max_words integer)
467 return word_vector is
468   p integer;
469   phrase varchar2(2000);
470   phrase1 varchar2(2000);
471   space  varchar2(1);
472   cntr integer := 1;
473   inlistCount integer := 0;
474   mw integer := 0;
475   retlist word_vector;
476   newInlist word_vector;
477 
478 begin
479   inlistCount:= inlist.count;
480   mw := max_words;
481   retlist := word_vector();
482  -- newInlist := word_vector();
483  newInlist := inlist;
484 
485  newInlist := remove_stopwords(newInlist);
486 
487  for i in 1 .. newInlist.count loop
488     --if not is_a_stopword(inlist(i)) then
489        space := '';
490        phrase := '';
491        p := 0;
492        while (p <= mw-1 and i+p <= newInlist.count) loop
493        --if (wildCard) OR (wildFirst)  then -- Check for and remove all wild card character from the phrase
494        --      dbms_output.put_line('in get_phrases  wild = true');
495        --      phrase1 := IEM_PARSER_PVT.remove_wild(newInlist(i+p));
496        --      phrase := phrase || space || phrase1 ;
497        --else
498            phrase := phrase || space || newInlist(i+p);
499        --end if;
500        if p = mw-1 then
501              retlist.extend;
502              retlist(cntr) := phrase;
503              --dbms_output.put_line('phrase: '|| phrase);
504              --dbms_output.put_line('');
505              cntr := cntr + 1;
506          end if;
507          space := ' ';
508          p := p+1;
509        end loop;
510     --end if;
511   end loop;
512   return retlist;
513 end;
514 
515 function parse_string (stringToParse in varchar2)
516 return word_vector is
517    ws       varchar2(32767);        -- workspace
518    theword  varchar2(2000);
519    p        integer;
520    cntr     integer := 1;
521    rettab   word_vector;
522 
523    c        integer := 0;
524 
525 begin
526 
527    rettab := word_vector();  -- initialize
528 
529    ws := stringToParse;
530 
531    -- translate all punctuation into spaces
532 
533   --  ws := translate (ws, '`!"$%^&*()_+{}:@~|<>?-=[];''#\,./',
534   --                      '                                 ');
535   -- Don't remove %,@,.,#,?,'and comma chars, use to denote wild card or email address.
536   -- If changed, be sure to add to the is_num function
537 
538 -- Commited out for R12 patch 4417790 issue
539 
540       --ws := translate (ws, '`!"$^&*()_+{}~|<>-=;\,/',
541                            '                               ');
542 
543    -- and whitespace control chars
544    ws := translate (ws, fnd_global.local_chr(10)||fnd_global.local_chr(11)||fnd_global.local_chr(12)||fnd_global.local_chr(13), '    ');
545 
546    -- upper case it all
547    ws := translate (ws, 'abcdefghijklmnopqrstuvwxyz',
548                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ');
549 
550    -- remove multiple spaces
551    while (instr(ws, '  ', 1) > 0) loop
552      ws := replace (ws, '  ', ' ');
553    end loop;
554 
555    -- and any leading or trailing space
556    if (substr(ws, 1, 1) = ' ') then
557      ws := substr(ws, 2, length(ws)-1);
558    end if;
559 
560    if (substr(ws, length(ws), 1) = ' ') then
561      ws := substr(ws, 1, length(ws)-1);
562    end if;
563 
564    theword := substr(ws, 280, 40);
565 
566    p := instr(ws, ' ');
567 
568    while p > 0 loop
569 
570       c := c+1;
571 
572       theword := substr(ws, 1, p-1);
573 
574       -- save word. Simply discard it if too long
575       if length(theword) <= MAXWORDLENGTH then
576         rettab.extend;
577         rettab(cntr) := theword;
578         cntr := cntr + 1;
579       end if;
580 
581       ws := substr(ws, p+1, length(ws)-p);
582       p := instr(ws, ' ');
583 
584    end loop;
585 
586    theword := ws;
587    if length(theword) > 0 then
588      rettab.extend;
589      rettab(cntr) := theword;
590    end if;
591 
592    return rettab;
593 end;
594 
595 -- compute the word vector
599   document         in varchar2,
596 
597 function compute_vector (
598   idx_name         in varchar2, --DEFAULT fnd_api.g_miss_char,
600   analyze_length   in integer default 5000,
601   window_size      in integer default 3
602   )
603 return word_vector is
604   maxlen   integer;
605   ws       integer;
606   thedoc   varchar2(32767);
607   retlist  word_vector;
608   t1 number; t2 number; t3 number; t4 number; t5 number;
609 begin
610 
611   if analyze_length is null then
612     maxlen := 5000;
613   else
614     maxlen := analyze_length;
615   end if;
616 
617   maxlen := least(maxlen, 32767);
618 
619   if window_size is null then
620     ws := 3;
621   else
622     ws := window_size;
623   end if;
624 
625   if length(document) > maxlen then
626     thedoc := substr(document, 1, maxlen);
627   else
628     thedoc := document;
629   end if;
630 
631   init(idx_name);
632 
633 --t1 := dbms_utility.get_time;
634 
635   retlist := parse_string(thedoc);
636   --dbms_output.put_line ('This is retlist after parse_string :'||  retlist(1));
637 
638 --t2 := dbms_utility.get_time;
639 --dbms_output.put_line('parse     ' || to_char((t2-t1)/100));
640 
641   --retlist := normalize_list ( idx_name, retlist );
642 
643 --t3 := dbms_utility.get_time;
644 -- dbms_output.put_line('Normalise ' || to_char((t3-t2)/100));
645 
646   --dbms_output.put_line ('Just before get_phrases');
647   --dbms_output.put_line ('This is retlist.count :'||  retlist.count);
648 
649   retlist := get_phrases    ( retlist, ws );
650 
651   --dbms_output.put_line ('Just after get_phrases');
652   --dbms_output.put_line ('This is retlist.count :'||  retlist.count);
653 
654 --t4 := dbms_utility.get_time;
655 --dbms_output.put_line('phrase    ' || to_char((t4-t3)/100));
656 
657   -- retlist := sort_list      ( retlist, true );
658   retlist := sort_list      ( retlist, false ); -- Don't remove duplicates
659 
660  --dbms_output.put_line ('This is retlist after sort :'||  retlist);
661  --for i in 1 .. retlist.count loop
662      --dbms_output.put_line ('This is retlist after sort :'||  retlist(i));
663  --end loop;
664 
665 --t5 := dbms_utility.get_time;
666 --dbms_output.put_line('Normalise ' || to_char((t3-t2)/100)||' sort ' || to_char((t5-t4)/100));
667 
668   return retlist;
669 end;
670 
671 -- same function for clobs
672 
673 function compute_vector (
674   idx_name         in varchar2,
675   document         in clob,
676   analyze_length   in integer default 5000,
677   window_size      in integer default 3
678   )
679 return word_vector is
680   retlist   word_vector;
681   maxlen    integer;
682   thestrng  varchar2(32767);
683   theoffset integer;
684   thesize   integer;
685   thebuff   varchar2(32767);
686 begin
687 
688   if analyze_length is null then
689     maxlen := 5000;
690   else
691     maxlen := analyze_length;
692   end if;
693 
694   maxlen := least(maxlen, 32767);
695 
696   theoffset := 1;
697   thestrng  := ' ';
698   while (true) loop
699 
700     begin
701       thesize := maxlen;
702       dbms_lob.read(
703         lob_loc    => document,
704         amount     => thesize,
705         offset     => theoffset,
706         buffer     => thebuff );
707     exception
708      when no_data_found then
709       exit;
710     end;
711 
712     exit when (thesize <= 0);
713     theoffset := thesize+1;
714 
715     if length(thestrng) + thesize < maxlen then
716       thestrng := thestrng || thebuff;
717     else
718       thestrng := thestrng || substr(thebuff, 1, maxlen-length(thestrng)-1);
719       exit;
720     end if;
721 
722   end loop;
723 
724   if length(thestrng) + thesize < maxlen then
725     thestrng := thestrng || thebuff;
726   else
727     thestrng := thestrng || substr(thebuff, 1, maxlen-length(thestrng)-1);
728   end if;
729 
730   retlist := compute_vector (idx_name, thestrng, maxlen, window_size);
731 
732   return retlist;
733 end;
734 
735 -- compare two lists of phrases
736 -- resulting similarity quotient is based on number of shared phrases,
737 -- reduced according to length of each vector.
738 
739 function compare_vectors (inlist1 in word_vector, inlist2 in word_vector)
740 return  word_vector is
741   result_array 		word_vector;
742   matchcnt 		integer := 0;
743   j  			integer := 1;
744   k  			integer := 1;
745   l_Error_Message	varchar2(200);
746   INVALID_WORD_VECTOR	EXCEPTION;
747 begin
748 
749   result_array := word_vector();  -- initialize
750 
751 
752   if inlist1 is null or inlist2 is null then
753      RAISE INVALID_WORD_VECTOR;
754   end if;
755 
756   if inlist1.count = 0 or inlist2.count = 0 then
757      RAISE INVALID_WORD_VECTOR;
758   end if;
759 
760 
761    while (j <= inlist1.count and k <= inlist2.count) loop
762      while (k <= inlist2.count and j <= inlist1.count) loop
763 
764       -- Don't count the wild # as a direct match in string when it is not quoted
765       if (inlist1(j) = inlist2(k)) and not (instr(inlist2(k), '#') > 0)  then
766           --dbms_output.put_line('***** match on in compare vector ***** '|| inlist1(j));
767           matchcnt := matchcnt + 1;
768           --dbms_output.put_line('***** matchcnt ***** '|| matchcnt);
769           result_array.extend;
770           result_array(matchcnt) :=  inlist1(j);
774       elsif  (instr(inlist2(k), '%') > 0) then
771           --dbms_output.put_line('***** result_array(matchcnt) ***** '|| result_array(matchcnt));
772           j := j+1;
773           exit;
775           --dbms_output.put_line('---------------------------');
776           --dbms_output.put_line('---------------------------');
777           --dbms_output.put_line('At wild % in compare_vec');
778           --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
779           --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
780           if (test_match(inlist2(k), inlist1(j))) then
781               --dbms_output.put_line('match on %  in compare_vectors '|| inlist1(j));
782               matchcnt := matchcnt + 1; -- update match count and move on
783               result_array.extend;
784               result_array(matchcnt) :=  inlist1(j);
785               j := j+1;
786               exit;
787           else -- just move to the next word
788               j := j+1;
789               exit;
790           end if;
791       elsif  (instr(inlist2(k), '#') > 0) then
792           --dbms_output.put_line('---------------------------');
793           --dbms_output.put_line('---------------------------');
794           --dbms_output.put_line('At wild # in compare_vec');
795           --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
796           --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
797           if (test_match_single(inlist2(k), inlist1(j))) then
798               --dbms_output.put_line('match on is_num in compare_vectors '|| inlist1(j));
799               matchcnt := matchcnt + 1; -- update match count and move on
800               result_array.extend;
801               result_array(matchcnt) :=  inlist1(j);
802               j := j+1;
803               exit;
804           else -- just move to the next word
805               j := j+1;
806               exit;
807           end if;
808       elsif  (instr(inlist2(k), '?') > 0) then
809           --dbms_output.put_line('---------------------------');
810           --dbms_output.put_line('---------------------------');
811           --dbms_output.put_line('At wild ? in compare_vec');
812           --dbms_output.put_line('This is inlist1(j)' || inlist1(j));
813           --dbms_output.put_line('This is inlist2(k)' || inlist2(k));
814           if (test_match_single(inlist2(k), inlist1(j))) then
815               --dbms_output.put_line('match on NOT is_num in compare_vectors '|| inlist1(j));
816               matchcnt := matchcnt + 1; -- update match count and move on
817               result_array.extend;
818               result_array(matchcnt) :=  inlist1(j);
819               j := j+1;
820               exit;
821           else -- just move to the next word
822               j := j+1;
823               exit;
824           end if;
825       elsif (inlist2(k) > inlist1(j)) or (wildCard) then
826         j := j+1;
827         --dbms_output.put_line('j+1 is  at k > j: '|| j);
828         --dbms_output.put_line('This is inlist1(j) at k > j: ' || inlist1(j));
829         --dbms_output.put_line('This is inlist2(k) at k > j: ' || inlist2(k));
830         exit;
831       end if;
832       k := k+1;
833     end loop;
834   end loop;
835   --dbms_output.put_line('Matches   : '||to_char(matchcnt));
836   --dbms_output.put_line('Word Count: '||to_char(inlist1.count+inlist2.count));
837   --dbms_output.put_line('Mail body word count: '||to_char(inlist1.count));
838   --dbms_output.put_line('sqrt      : '||to_char(sqrt(sqrt(inlist1.count+inlist2.count))));
839   --dbms_output.put_line(inlist1.count+inlist2.count);
840  wildCard := false; -- reset global variable
841       --dbms_output.put_line (' --------- wildCard set to false ----------- ');
842   --return (matchcnt*1.0)/sqrt(sqrt((inlist1.count+inlist2.count)))*100;
843   --return ((matchcnt*1.0)/(inlist1.count))*100;
844   --return ((matchcnt*1.0)/(inlist2.count))*100;
845     return result_array;
846     EXCEPTION
847         WHEN INVALID_WORD_VECTOR THEN
848         FND_MESSAGE.SET_NAME('IEM','IEM_INVALID_WORD_VECTOR');
849         l_Error_Message := FND_MESSAGE.GET;
850         fnd_file.put_line(fnd_file.log, l_Error_Message);
851 
852 end;
853 
854 procedure p_themes (
855    index_name      in  varchar2,
856 --   textkey         in  varchar2,
857 --   restab          in out  ctx_doc.theme_tab,
858    restab       in out NOCOPY iem_im_wrappers_pvt.theme_table,
859    full_themes     in  boolean default true,
860    num_themes      in  number default 32,
861    document        in  varchar2,
862    pictograph      in boolean default false
863    )
864 is
865 --  thedoc           clob;
866   wv               word_vector;
867 begin
868 
869 
870 --  ctx_doc.filter(
871 --      index_name => index_name,
872 --      textkey    => textkey,
873 --      restab     => thedoc,
874 --      plaintext  => true );
875 
876   wv := IEM_PARSER_PVT.compute_vector(
877 --      idx_name       => 'acr_resp_index',
878 idx_name       => index_name,
879       document       => document,
880       analyze_length => DOCLENGTH,
881       window_size    => WINDOWSIZE );
882 
883   for i in 1 .. wv.count loop
884     IF wv(i) <>' ' then
885     	restab(i).theme  := wv(i);
886     	restab(i).weight := 1;
887     END IF;
888   end loop;
889 
890 end;
891 
892 function get_window_size (search_string varchar2)
893 return integer is
894 
895 INVALID_LOB              EXCEPTION;
896 ws       varchar2(32767);        -- workspace
897 theword  varchar2(2000);
898 p        integer;
899 cntr     integer := 1;
900 rettab   word_vector;
901 window_size integer := 0;
902 
903 BEGIN
904 
905 --  Find the window size/number of words in the search phrase
906 
910      ws := replace (ws, '  ', ' ');
907    ws := search_string;
908 -- remove multiple spaces
909    while (instr(ws, '  ', 1) > 0) loop
911    end loop;
912 
913    -- and any leading or trailing space
914    if (substr(ws, 1, 1) = ' ') then
915      ws := substr(ws, 2, length(ws)-1);
916    end if;
917 
918    if (substr(ws, length(ws), 1) = ' ') then
919      ws := substr(ws, 1, length(ws)-1);
920    end if;
921 
922    theword := substr(ws, 280, 40);
923 
924    p := instr(ws, ' ');
925 
926    while p > 0 loop
927     window_size := window_size+1;
928 
929       ws := substr(ws, p+1, length(ws)-p);
930 
931       p := instr(ws, ' ');
932 
933    end loop;
934    window_size := window_size+1;  -- Final window/word count
935    return window_size;
936 END; -- get_window_size
937 
938 function remove_wild (mask varchar2)
939 return varchar2 is
940 
941 newMask  varchar2(2000);
942 l_count     integer := 0;
943 maskLength integer := 0;
944 
945 BEGIN
946     l_count := instr(mask, '%');
947     maskLength := length(mask);
948     if l_count > 1 then -- % is at the end of the word
949         newMask  := substr(mask, 1, maskLength -1); -- Remove % form the wnd of the mask
950         return newMask;
951     elsif l_count = 1 then -- % is at the begining of the word
952         newMask  := substr(mask, 2, maskLength -1);
953         return newMask;
954    end if;
955    return mask;
956 END;
957 
958 function test_match (mask varchar2, testToken varchar2)
959 return boolean is
960 
961 maskLength integer := 0;
962 testTokenLength integer := 0;
963 
964  matchcnt integer := 0;
965   m  integer := 1;
966   n  integer := 1;
967   literalChar boolean := false;
968 
969 BEGIN
970 
971   maskLength := length(mask);
972   testTokenLength := length(testToken);
973 
974   if mask is null or testToken is null then
975     return false;
976   end if;
977 
978   if maskLength = 0 or testTokenLength = 0 then
979     return false;
980   end if;
981 
982   while (m <= maskLength and n <= testTokenLength) loop
983       --dbms_output.put_line('-------------------------------------');
984       --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
985       --dbms_output.put_line('substr(testToken,n,1) '|| substr(testToken,n,1));
986 
987       if (substr(mask,m,1) = fnd_global.local_chr(39)) then
988           --dbms_output.put_line('At chr(39)');
989           m := m+1; -- move to the next character in the mask
990           --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
991           --dbms_output.put_line('=-=-=-=-=-=-=-=-=-=-=-=-=-=');
992           matchcnt := matchcnt + 1; --  count this as a match.
993           if  (literalChar = false) then
994               literalChar := true; --flip the literal character flag
995           else
996               literalChar := false;
997          end if;
998       elsif (substr(mask,m,1) = '#' AND NOT literalChar) then -- We need to see if the current testToken char is anumber
999           if (is_num(substr(testToken,n,1))) then
1000               m := m+1; -- move to the next character in the mask
1001               matchcnt := matchcnt + 1; --  this is a match.
1002               n := n+1; -- move to the next testToken char
1003               --dbms_output.put_line('(substr(mask,m,1) TRUE '|| substr(mask,m,1));
1004           else
1005               --dbms_output.put_line('substr(mask,m,1) FALSE '|| substr(mask,m,1));
1006               return false; -- This is not a match for this token
1007           end if;
1008       elsif (substr(mask,m,1) = '?' AND NOT literalChar) then -- skip this in the mask and count it as a
1009           if (NOT(is_num(substr(testToken,n,1)))) then
1010               m := m+1; -- move to the next character in the mask
1011               matchcnt := matchcnt + 1; --  this is a match
1012               n := n+1; -- move to the next testToken char
1013           else
1014               return false; -- This is not a match for this token
1015           end if;
1016       elsif (substr(mask,m,1) = '%' AND NOT literalChar) then -- test if testToken has a match for then next mask character
1017                                      -- anywhere in it's string.
1018           m := m+1; -- move to the next character in the mask
1019           matchcnt := matchcnt + 1; -- have the wild card march.
1020           n := n+1; -- The % will match any single character in the testToken string
1021 
1022           while (m <= maskLength and n <= testTokenLength) loop
1023               if substr(mask,m,1) = substr(testToken,n,1) then
1024                   --dbms_output.put_line('match on new and improved Wild '|| substr(testToken,n,1));
1025                   matchcnt := matchcnt + 1;
1026                   m := m+1;
1027                   n := n+1;
1028               else
1029                   -- Need to take into account the single quote
1030                   if (substr(mask,m,1) = '%' OR substr(mask,m,1) = '#' OR  substr(mask,m,1) = '?' OR substr(mask,m,1) = fnd_global.local_chr(39)) then
1031                       exit; -- leave inner loop and continue processing strings
1032                   else
1033                       n :=n+1;  -- chars don't match so move n along
1034                  end if;
1035               end if;
1036           end loop;
1037       elsif substr(mask,m,1) = substr(testToken,n,1) then
1038           --dbms_output.put_line('match on from new test_match: '|| substr(testToken,n,1));
1039           matchcnt := matchcnt + 1; -- move on in mask and testToken
1040           m := m+1;
1041           n := n+1;
1042       else  -- not a match so return false
1043            --dbms_output.put_line('At last else -- returnig false at New test_match');
1044           return false;
1045       end if;
1049 
1046     end loop;
1047 
1048     --dbms_output.put_line('matchLength and matchcnt '|| maskLength ||' '|| matchcnt);
1050     if maskLength = matchcnt then
1051         return true;
1052     else
1053         return false;
1054     end if;
1055 END; -- test_match
1056 
1057 function test_match_single (mask varchar2, testToken varchar2)
1058 return boolean is
1059 
1060 maskLength integer := 0;
1061 testTokenLength integer := 0;
1062 
1063  matchcnt integer := 0;
1064   m  integer := 1;
1065   n  integer := 1;
1066   literalChar boolean := false;
1067 
1068 BEGIN
1069 
1070   maskLength := length(mask);
1071   testTokenLength := length(testToken);
1072 
1073   if mask is null or testToken is null then
1074     return false;
1075   end if;
1076 
1077   if maskLength = 0 or testTokenLength = 0 then
1078     return false;
1079   end if;
1080 
1081   while (m <= maskLength and n <= testTokenLength) loop
1082       --dbms_output.put_line('-------------------------------------');
1083       --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
1084       --dbms_output.put_line('substr(testToken,n,1) '|| substr(testToken,n,1));
1085 
1086       if (substr(mask,m,1) = fnd_global.local_chr(39)) then
1087           --dbms_output.put_line('At chr(39)');
1088           m := m+1; -- move to the next character in the mask
1089           --dbms_output.put_line('substr(mask,m,1) '|| substr(mask,m,1));
1090           --dbms_output.put_line('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<');
1091           if  (literalChar = false) then
1092               literalChar := true; --flip the literal character flag
1093           else
1094               literalChar := false;
1095          end if;
1096 
1097       elsif( substr(mask,m,1) = '#'  AND NOT literalChar) then -- test if testToken has an number in it's corresponding position
1098             if (is_num(substr(testToken,n,1))) then
1099                --dbms_output.put_line('match on from is_num: TRUE '|| substr(testToken,n,1));
1100                matchcnt := matchcnt + 1;
1101                m := m+1;
1102                n := n+1;
1103             else -- testToken position does not contain an number
1104                 --dbms_output.put_line('match on from is_num: FALSE '|| substr(testToken,n,1));
1105                 return false;
1106             end if;
1107       elsif (substr(mask,m,1) = '?'  AND NOT literalChar)then -- test if testToken has an number in it's corresponding position
1108             if (not(is_num(substr(testToken,n,1)))) then
1109                --dbms_output.put_line('match on NOT num from is_num: '|| substr(testToken,n,1));
1110                matchcnt := matchcnt + 1;
1111                m := m+1;
1112                n := n+1;
1113             else -- testToken position does not contain an number
1114                 return false;
1115             end if;
1116       elsif substr(mask,m,1) = substr(testToken,n,1) then
1117           --dbms_output.put_line('match on from test_match_single: '|| substr(testToken,n,1));
1118           matchcnt := matchcnt + 1; -- move on in mask and testToken
1119           m := m+1;
1120           n := n+1;
1121       else  -- not a match so return false
1122            --dbms_output.put_line('At last else -- returnig false at test_mask_single');
1123           return false;
1124       end if;
1125     end loop;
1126 
1127     --dbms_output.put_line('matchLength and matchcnt '|| maskLength ||' '|| matchcnt);
1128 
1129     if testTokenLength = matchcnt then
1130         return true;
1131     else
1132         return false;
1133     end if;
1134 END; -- test_match_single
1135 
1136 Function is_num (s1 varchar2)
1137   RETURN  boolean IS
1138 BEGIN
1139     IF
1140 INSTR(TRANSLATE(upper(s1),'ABCDEFGHIJKLMNOPQRSTUVWXYZ%@.#?,','$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'),'$') = 0 THEN
1141        RETURN true; -- No characters where translated so s1 is a number
1142     ELSE
1143        RETURN false; -- Characters were translated to $ so s1 was not a number`
1144     END IF;
1145 END; -- is_num
1146 
1147 procedure test is
1148   vec1 		word_vector;
1149   vec2 		word_vector;
1150   result_vec	word_vector;
1151   doc  		varchar2(4000);
1152 begin
1153 
1154   doc := 'the QUICK brown Fox because JUMPS o''er the LAZy DOG. The Dogs are LAUGHING to see such foxes AND the
1155 fox-cow JUMPED over the moon because he could';
1156 -- fails
1157 --  doc := 'the QUICK brown Fox because JUMPS over the LAZy DOG The Dogs';
1158 -- ok
1159 --  doc := 'aa BB cc dd ee ff gg hh ii jj kk ll mm nn oo pp qq rr ss tt uu vv ww xx yy zz';
1160 --??
1161     doc := 'the QUICK brown Fox because JUMPS
1162 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa over the LAZy DOG the p qq rr
1163 ss tt uu vv ww xx yy zz';
1164 
1165   vec1 := IEM_PARSER_PVT.compute_vector(
1166     idx_name => 'explain_ex_text',
1167     document => doc,
1168     window_size => 3);
1169 
1170 --dbms_output.put_line('----------');
1171 
1172   vec2 := IEM_PARSER_PVT.compute_vector(
1173     idx_name => 'explain_ex_text',
1174     document => 'how now red fox. The quick quacking quick brown fox is a lazy dog',
1175     window_size => 3);
1176 
1177   result_vec := IEM_PARSER_PVT.compare_vectors(vec1, vec2);
1178 
1179   --dbms_output.put_line('Similarity %age is : ' || to_char(round((result_vec.count), 2)));
1180 
1181 end;
1182 
1183 */
1184 END IEM_PARSER_PVT;