In this new OCaml session I am going to implement the OCaml cookbook recipe for the problem of computing the nearest neighbors of a data set.
Nearest neighbors
It is a problem that can be formulated within applied statistics; so, let us imagine N elements in a space (which I will be able to represent by vectors) that are samples of a multidimensional random variable. Let us imagine a reference element (often it is an element of this set). Which are the elements of the set closest to this reference? The initial answer is that it depends on how we define the distance between two elements (which in mathematics is called a metric). A distance is a function that given two elements of this space returns the distance between them, but it must have a number of properties to qualify as a distance.
Let's consider that the elements are vectors with three elements (with real values). In our case we make use of the fact that the elements will be part of a three-dimensional real vector space and the most usual distance is called Euclidean distance which is the one we will implement, but our definition of nearest neighbors (as a function) will try to have the definition of distance among its arguments and not predetermined.
Operations
Since we are going to use a Euclidean distance between two vectors (which will be two arrays
whose elements are floats
) then its definition will be:
$d(\bo x,\bo y)=√(∑↙{i=0}↖3 (x_i-y_i)^2)$
but this is analogous to saying that,
$d(\bo x, \bo y)=√[(\bo x- \bo y)∙(\bo x-\bo y)]$
where $∙$ is the dot product of two vectors (which we represent as $\bo x$ and $\bo y$). So, let's see how these definitions look like in OCaml using the Array
module of the standard library:
(* Operator `*@` is the typical dot product between vectors, in this case with floats elements.*)
let ( *@ ) u v = Array.map2 ( *. ) u v |> Array.fold_left ( +. ) 0.
(* Operator `*!` defines multiplication of a scalar (float) by a vector *)
let ( *! ) a v = Array.map ( fun x -> a *. x ) v
(* Operator `+:` define the sum of two vectors *)
let ( +: ) u v = Array.map2 ( +. ) u v
(* Operator `-:` is introduced for convenience/simplicity in the `metric_p2`, we could also define
it directly as `let ( -: ) u v = Array.map2 ( +. ) u ( -1.0 *! v )` *)
let ( -: ) u v = u +: (-1.0 *! v)
(* `metric_p2` is the Euclidean distance based on the p-norm with p=2, it is the most typical distance. *)
let metric_p2 u v = sqrt (( u -: v ) *@ ( u -: v) )
Distances of a set of points
As we said we have to calculate the distances between a reference element and a set of samples of a random variable, we define a function distance_in_set
:
(* `distance_in_set` calculates all the distances in a set of elements `s` with respect to a reference
element `ref` using a `metric` function. It returns a tuple `int * float` for the index of the element
in the set and the distance of that element with respect to the reference. *)
let distance_in_set ref s metric =
let n = Array.length s in
Array.init n (fun i -> (i, metric ref (Array.get s i)) )
As we see ref
is our reference element, s
is the sample that owns the set of elements to evaluate, and distance
is the function that calculates the distances between elements (vectors). We have chosen to have this function return a pair (tuple) of data: the index of the element in our dataset (the dataset will be an array of an array), and the distance from that element to our reference.
The second part is to sort this set based on the value of the distances:
(* `sort_by_distance` will sort the output of the `distance_in_set` with in-place replacement, so the
indices of the k nearest neighbours will be the k first elements of the array returned by this function *)
let sort_by_distance (d:(int * float) array) =
let cmd x y =
if (snd x -. snd y) = 0.0 then 0
else if (snd x -. snd y) > 0.0 then 1
else -1
in
Array.sort cmd d
Thus, obtaining the nearest neighbor to a reference will be:
(* `get_nn s d` returns the nearest neighbour (first element of the sorted by distance set *)
let get_nn s d =
let nn_idx = fst (Array.get d 0) in
(nn_idx, Array.get s nn_idx)
Complete program
The complete program has some support functions to define elements of our sample space (in our case using a uniform distribution) and other auxiliary functions to print the results.
(* Operator `*@` is the typical dot product between vectors, in this case with floats elements.*)
let ( *@ ) u v = Array.map2 ( *. ) u v |> Array.fold_left ( +. ) 0.
(* Operator `*!` defines multiplication of a scalar (float) by a vector *)
let ( *! ) a v = Array.map ( fun x -> a *. x ) v
(* Operator `+:` define the sum of two vectors *)
let ( +: ) u v = Array.map2 ( +. ) u v
(* Operator `-:` is introduced for convenience/simplicity in the `distance_p2`, we could also define
it directly as `let ( -: ) u v = Array.map2 ( +. ) u ( -1.0 *! v )` *)
let ( -: ) u v = u +: (-1.0 *! v)
(* `metric_p2` is the Euclidean distance based on the p-norm with p=2, it is the most typical distance. *)
let metric_p2 u v = sqrt (( u -: v ) *@ ( u -: v) )
(* `distance_in_set` calculates all the distances in a set of elements `s` with respect to a reference
element `ref` using a `metric` function. It returns a tuple `int * float` for the index of the element
in the set and the distance of that element with respect to the reference. *)
let distance_in_set ref s metric =
let n = Array.length s in
Array.init n (fun i -> (i, metric ref (Array.get s i)) )
(* `sort_by_distance` will sort the output of the `distance_in_set` with in-place replacement, so the
indices of the k nearest neighbours will be the k first elements of the array returned by this function *)
let sort_by_distance (d:(int * float) array) =
let cmd x y =
if (snd x -. snd y) = 0.0 then 0
else if (snd x -. snd y) > 0.0 then 1
else -1
in
Array.sort cmd d
(* `get_nn s d` returns the nearest neighbour (first element of the sorted by distance set *)
let get_nn s d = let nn_idx = fst (Array.get d 0) in
(nn_idx, Array.get s nn_idx)
(* Supporting functions: we create a set of functions to sample in space points. The first is `sample_in_line`
which draws a point in a segment [x,y] for a uniform distribution (all points in a segment have the same
probability). We use the standard library, but there is a complete set of functions for that in Owl.Stats,
so in the case of `sample_in_line` we could use for example `Owl.Stats.uniform_rvs ~a:x ~b:y` *)
let sample_in_interval x y = (Random.float 1.0 ) *. (y -. x)
(* `sample_in_cube` draws an element in a cube (the position of the vector is within a cube), the element
(vector) will have 3 coordinates, so an array of length 3. *)
let sample_in_cube a b = [| (sample_in_interval a b) ; (sample_in_interval a b) ; (sample_in_interval a b) |]
(* `sample_set_in_cube` create n random elements of the space within a cube *)
let sample_set_in_cube n a b = Array.init n (fun _ -> (sample_in_cube a b))
(* Auxiliary functions: three functions to print our results. *)
let print_point v =
let s = Array.map (Printf.sprintf "%6f") v |> Array.to_list in
Printf.printf "[ %s ]\n" (String.concat ", " s)
let print_set s = Array.iter print_point s
let print_distance x =
let ind = Array.map (fst) x in
let dis = Array.map (snd) x in
let s = Array.map2 (Printf.sprintf "(index %d -> distance %6f)") ind dis |> Array.to_list in
Printf.printf "[ %s ]\n" (String.concat ", " s)
(* Testing the functions *)
let () = let ref = sample_in_cube (-10.0) 10.0 in
let _ = Printf.printf "Our reference point is:" in
let _ = print_point ref in
let cc = sample_set_in_cube 5 (-10.0) 10.0 in
let _ = Printf.printf "The set of points to evaluate is:\n" in
let _ = print_set cc in
let _ = Printf.printf "Their distances with respect to reference are:\n" in
let _ = print_distance (distance_in_set ref cc distance_p2) in
let ss = distance_in_set ref cc distance_p2 in
let _ = sort_by_distance ss in
let _ = Printf.printf "The sorted points by distance to reference are:\n" in
let _ = print_distance ss in
let _ = Printf.printf "Therefore the nearest neighbour is:\n" in
let nn = get_nn cc ss in
let _ = Printf.printf "Index: %d, Coordinates: " (fst nn) in
print_point (snd nn);;
Discussion
Perhaps the most difficult task is to understand the Array.map
, Array.map2
and Array.fold_left
functions to define our operators between vectors, and the Array.sort
function where it is key to be able to create a function that compares the second elements of a tuple. Otherwise, it is simply applying mathematical definitions in a straightforward way. I emphasize that we have defined inflix
operators when possible, something, I believe, simple/easy to do in OCaml.