(*
* lablai - An ML Artificial Inteligence library
* Copyright (C) 2006 Till Crueger
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*)
(* File $RCSfile$ *)
(* last edited by $Author: till_crueger $ *)
(* $Date: 2007-12-15 19:53:03 +0100 (Sa, 15 Dez 2007) $, $Revision: 29 $*)
(* a general naive bayes knowledge *)
(* consist of a category; a count that indicates how often this category occured
and a array of Lists, indicating the probability of each value for a given attribute*)
(* the position in the array indicates the attribute type *)
type 'a knowledge = {kn_category : 'a ; kn_count : int ; kn_data : (Types.attribute * float) list array };;
(* Bias for attributes about which nothing is known *)
let bias = 0.1;;
(* turn a simple count into probapbilities *)
let divide (x : (Types.attribute * int) list array) (total : int) =
(* divide each count by the total*)
let divider (cat,count) = (cat,(float_of_int count) /. (float_of_int total)) in
Array.map (List.rev_map divider) x
;;
let learn examples =
(* sort examples according to the categories *)
let sorted = Types.sort_by_cat examples in
(* count the occurence of each category
an the distribution of the attributes in each category *)
let rec loop categories akku =
match categories with
(cat,data) :: rest ->
(* get all the attribute counts in a category *)
let counted = Types.count data in
(* turn the counts into probapbilities *)
let probs = divide counted (List.length data) in
let result = {kn_category = cat; kn_count = List.length data; kn_data = probs} in
loop rest (result :: akku)
| [] -> akku
in
loop sorted []
;;
let calculate_prob knowl data =
(* get all the probapbilities for the given data*)
let retrieve probs attrib =
try
List.assoc attrib probs
with Not_found ->
(* if we know nothing, we use the bias *)
bias
in
let probs = Array.mapi (fun i -> retrieve knowl.kn_data.(i)) data in
(* calculate the overall probapbility *)
(List.fold_left ( *. ) 1.0 (Array.to_list probs)) *. (float_of_int knowl.kn_count)
;;
let evaluate knowls data =
(* find the category with highest probapbility *)
let rec loop knowls value akku =
match knowls with
[] -> akku
| knowl :: knowls ->
(* calculate the current probapbility *)
let prob = calculate_prob knowl data in
(* compare with current best *)
if prob >= value then
loop knowls prob knowl.kn_category
else
loop knowls value akku
in
loop knowls ~-.10.0 ((List.hd knowls).kn_category)
;;
let print_knowledge print_cat knowl =
print_string "{\n category=";
print_cat knowl.kn_category;
print_string "\n count=";
print_int knowl.kn_count;
print_string "\n attributes=\n [\n";
let print_probs n (attr,prob) =
print_string " P(D";
print_int n;
print_string " = ";
Types.print_attribute attr;
print_string ")= ";
print_float prob;
print_string "\n"
in
Array.iteri (fun n -> List.iter (print_probs n)) knowl.kn_data;
print_string " ]\n}\n"
;;
(*
* $Log$
* Revision 1.4 2007/12/15 18:52:57 till_crueger
* - Updated documentation
* - Moved Log-Tags to a better position in the sources
*
* Revision 1.3 2006/03/26 17:58:24 till_crueger
* Major code cleanup and improvment of the documentation.
*
* Revision 1.2 2006/02/20 20:21:46 till
* Added LGPL to all files
* Added LGPL to package
*
* Revision 1.1 2006/02/20 16:15:02 till
* Began work on naive Bayes learner
*
*)