twitter · johnynek · Nov 30, 2015 · Aug 3, 2015 · Aug 4, 2015 · Aug 4, 2015
diff --git a/algebird-core/src/main/scala/com/twitter/algebird/Approximate.scala b/algebird-core/src/main/scala/com/twitter/algebird/Approximate.scala
@@ -16,9 +16,13 @@ limitations under the License.
 
 package com.twitter.algebird
 
+private[algebird] trait ApproximateSet[T] { // a set of T whose membership test answers approximately
+  def contains(t: T): ApproximateBoolean // verdict on whether t is a member, with its probability bound
+}
+
 // This gives an answer, and a LOWER BOUND on the probability that answer is
 // correct
-case class ApproximateBoolean(isTrue: Boolean, withProb: Double) {
+case class ApproximateBoolean(isTrue: Boolean, withProb: Double) extends ApproximateSet[Boolean] {
 
   def not: ApproximateBoolean = ApproximateBoolean(!isTrue, withProb)
 
@@ -58,6 +62,8 @@ case class ApproximateBoolean(isTrue: Boolean, withProb: Double) {
       ApproximateBoolean(false, newP)
     }
   }
+
+  def contains(b: Boolean): ApproximateBoolean = if (b) this else not // result.isTrue iff b == isTrue; withProb is kept
 }
 
 object ApproximateBoolean {
@@ -67,7 +73,7 @@ object ApproximateBoolean {
 }
 
 // Note the probWithinBounds is a LOWER BOUND (at least this probability)
-case class Approximate[N](min: N, estimate: N, max: N, probWithinBounds: Double)(implicit val numeric: Numeric[N]) {
+case class Approximate[N](min: N, estimate: N, max: N, probWithinBounds: Double)(implicit val numeric: Numeric[N]) extends ApproximateSet[N] {
   require(numeric.lteq(min, estimate) && numeric.lteq(estimate, max))
 
   /**
@@ -113,10 +119,10 @@ case class Approximate[N](min: N, estimate: N, max: N, probWithinBounds: Double)
       this
     } else {
       val n = numeric
-      val ends = for (
-        leftv <- List(min, max);
+      val ends = for {
+        leftv <- List(min, max)
         rightv <- List(right.min, right.max)
-      ) yield n.times(leftv, rightv)
+      } yield n.times(leftv, rightv)
 
       val newProb = probWithinBounds * right.probWithinBounds
 
@@ -159,7 +165,7 @@ object Approximate {
   // Not a group/ring:
   // negate fails: x - x != 0, because with some probability the bound is bad.
   // distributive fails because a*b + a*c ignores that a is either in or out
-  // of the bound, and counts it idependently.
+  // of the bound, and counts it independently.
   implicit def monoid[N](implicit n: Numeric[N]): Monoid[Approximate[N]] = {
     // avoid capturing the Numeric:
     val z = Approximate.zero[N]

diff --git a/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala b/algebird-core/src/main/scala/com/twitter/algebird/HyperLogLog.scala
@@ -60,15 +60,15 @@ object HyperLogLog {
     buf
   }
 
-  implicit def int2Bytes(i: Int) = {
+  implicit def int2Bytes(i: Int): Array[Byte] = { // returns a new 4-byte array filled via ByteBuffer.putInt(i)
     val buf = new Array[Byte](4)
     ByteBuffer
       .wrap(buf)
       .putInt(i)
     buf
   }
 
-  implicit def long2Bytes(i: Long) = {
+  implicit def long2Bytes(i: Long): Array[Byte] = {
     val buf = new Array[Byte](8)
     ByteBuffer
       .wrap(buf)

diff --git a/algebird-test/src/main/scala/com/twitter/algebird/ApproximateProperty.scala b/algebird-test/src/main/scala/com/twitter/algebird/ApproximateProperty.scala
@@ -0,0 +1,127 @@
+package com.twitter.algebird
+
+import org.scalacheck.{ Gen, Prop, Properties, Test }
+import org.scalacheck.util.Pretty
+
+trait ApproximateProperty { // one statistical comparison of an approximate structure with an exact one
+  type Exact // the exact structure
+  type Approx // the approximate structure built from an Exact
+  type Input // a query posed to both structures
+  type Result // the answer to a query
+  // Generators for an exact object and for inputs, which may depend on that object.
+  def exactGenerator: Gen[Exact]
+  def inputGenerator(e: Exact): Gen[Input]
+  // Builds the approximate counterpart of an exact object.
+  def makeApproximate(e: Exact): Approx
+  // Answers to the same input; the approximate one is asked whether it `contains` the exact one.
+  def exactResult(e: Exact, i: Input): Result
+  def approximateResult(a: Approx, i: Input): ApproximateSet[Result]
+}
+
+object ApproximateProperty {
+
+  /**
+   *  Samples `gen` until it yields a list of exactly n Ts and returns it.
+   *  `Gen.listOfN(n, gen).sample` only gives an Option[List[T]]; this helper
+   *  unwraps it, retrying (without bound) whenever no sample was produced.
+   */
+  private def genListOf[T](n: Int, gen: Gen[T]): List[T] =
+    Gen.listOfN(n, gen).sample match {
+      // No sample this time: ask the generator again.
+      case None => genListOf(n, gen)
+      case Some(items) => items
+    }
+
+  private def successesAndProbabilities(a: ApproximateProperty, objectReps: Int, inputReps: Int): List[(Int, Double, List[String])] =
+    // One (1 if right else 0, claimed probability, failure messages) triple per kept trial.
+    genListOf(objectReps, a.exactGenerator).flatMap { exact =>
+      val approx = a.makeApproximate(exact)
+      genListOf(inputReps, a.inputGenerator(exact)).flatMap { input =>
+        val approximated = a.approximateResult(approx, input)
+        val expected = a.exactResult(exact, input)
+        val verdict = approximated.contains(expected)
+        if (verdict.withProb == 0.0) {
+          // A claimed probability of exactly 0 is left out of the tally.
+          Nil
+        } else if (verdict.isTrue) {
+          List((1, verdict.withProb, List.empty[String]))
+        } else {
+          List((0, verdict.withProb, List(s"Exact result: $expected. Approx result: $approximated.")))
+        }
+      }
+    }
+
+  def toProp(a: ApproximateProperty, objectReps: Int, inputReps: Int, falsePositiveRate: Double): Prop =
+    new Prop {
+      def apply(params: Gen.Parameters) = {
+        require(0 <= falsePositiveRate && falsePositiveRate <= 1)
+        // Statistical check: run the trials, then compare observed successes with the claimed total.
+        val list = successesAndProbabilities(a, objectReps, inputReps)
+        val n = list.length
+        // Presumably the implicit tuple Monoid adds the counts and probabilities and concatenates the messages.
+        val monoid = implicitly[Monoid[(Int, Double, List[String])]]
+        val (successes, sumOfProbabilities, exacts) = monoid.sum(list)
+
+        // Hoeffding: for n independent trials bounded in [0, 1], P(E[S] - S >= t) <= exp(-2 t^2 / n).
+        // Solving exp(-2 t^2 / n) = falsePositiveRate gives t below. TODO confirm the trials are independent enough.
+        val diff = scala.math.sqrt(-n * scala.math.log(falsePositiveRate) / 2.0)
+
+        val success = if (successes >= (sumOfProbabilities - diff)) Prop.Proof else Prop.False
+
+        // Args that get printed when Scalacheck runs the test
+        val argsList: List[(String, String)] = {
+          val results = List(("Successes", s"$successes (out of $n)"),
+            ("Expected successes", "%.2f".format(sumOfProbabilities)),
+            ("Required successes", "%.2f".format(sumOfProbabilities - diff)))
+
+          val exampleFailures =
+            if (success == Prop.False)
+              List(("Example failures:\n  >", exacts.take(5).mkString("\n  >")))
+            else List()
+          // Trials dropped by successesAndProbabilities show up as the gap between requested and kept counts.
+          val zeroProbTests = objectReps * inputReps - n
+          val testsReturnedZeroProb =
+            if (zeroProbTests > 0) {
+              List(("Omitted results", s"${zeroProbTests}/${objectReps * inputReps} tests returned an Approximate with probability 0. These tests have been omitted from the calculation."))
+            } else List()
+
+          results ++ exampleFailures ++ testsReturnedZeroProb
+        }
+
+        val args = argsList.map {
+          case (name, value) =>
+            Prop.Arg(name, value, 0, value, Pretty.prettyAny(value), Pretty.prettyAny(value))
+        }
+
+        Prop.Result(success, args = args)
+      }
+    }
+
+  /**
+   * Converts a list of ApproximateProperties to a scalacheck Prop that
+   * fails if too many of the ApproximateProperties fail, i.e. if the pooled
+   * successes fall more than the Hoeffding slack below the claimed total.
+   * TODO use `new Prop` like the single-property `toProp` for useful error messages.
+   */
+  def toProp(a: Seq[ApproximateProperty], objectReps: Int, inputReps: Int, falsePositiveRate: Double): Prop = {
+    require(0 <= falsePositiveRate && falsePositiveRate <= 1)
+
+    val list = a.flatMap { approximateProp =>
+      successesAndProbabilities(approximateProp, objectReps, inputReps)
+    }
+    val monoid = implicitly[Monoid[(Int, Double, List[String])]]
+    val (successes, sumOfProbabilities, _) = monoid.sum(list)
+    val n = list.length
+    // Pass when the shortfall is within the slack; the previous `>` passed only when it was exceeded.
+    (sumOfProbabilities - successes) <= scala.math.sqrt(n * scala.math.log(falsePositiveRate) / -2)
+  }
+}
+
+/**
+ * All tests that use ApproximateProperty should extend from this class: it sets
+ * minSuccessfulTests to 1, the intent being that each scalacheck property is run exactly once.
+ */
+abstract class ApproximateProperties(name: String) extends Properties(name) {
+  def overrideParameters(p: Test.Parameters): Test.Parameters =
+    p.withMinSuccessfulTests(1)
+}
diff --git a/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala b/algebird-test/src/test/scala/com/twitter/algebird/BloomFilterTest.scala
@@ -2,7 +2,7 @@ package com.twitter.algebird
 
 import java.io.{ ByteArrayOutputStream, ObjectOutputStream }
 
-import org.scalacheck.{ Arbitrary, Gen }
+import org.scalacheck.{ Arbitrary, Gen, Properties }
 import org.scalatest.{ Matchers, WordSpec }
 import org.scalacheck.Prop._
 
@@ -95,6 +95,86 @@ class BFHashIndices extends CheckProperties {
   }
 }
 
+// Compares BF membership answers with an exact Set[T]; elements are inserted and queried by their toString.
+class BloomFilterFalsePositives[T: Gen](falsePositiveRate: Double) extends ApproximateProperty {
+  type Exact = Set[T]
+  type Approx = BF
+  type Input = T
+  type Result = Boolean
+
+  val maxNumEntries: Int = 1000
+  val seed: Int = 1
+
+  // A set built from between 1 and maxNumEntries generated elements.
+  def exactGenerator: Gen[Set[T]] = for {
+    numEntries <- Gen.choose(1, maxNumEntries)
+    set <- Gen.containerOfN[Set, T](numEntries, implicitly[Gen[T]])
+  } yield set
+
+  // A Bloom filter configured with the set's size and holding every element's string form.
+  def makeApproximate(set: Set[T]): BF = {
+    val bfMonoid = BloomFilter(set.size, falsePositiveRate, seed)
+    val strings = set.map(_.toString).toSeq
+    bfMonoid.create(strings: _*)
+  }
+
+  // Queries are drawn from the set's members mixed with set.size freshly generated values.
+  def inputGenerator(set: Set[T]): Gen[T] =
+    for {
+      randomValues <- Gen.listOfN[T](set.size, implicitly[Gen[T]])
+      x <- Gen.oneOf((set ++ randomValues).toSeq)
+    } yield x
+
+  def exactResult(s: Set[T], t: T): Boolean = s.contains(t)
+  def approximateResult(bf: BF, t: T): ApproximateBoolean = bf.contains(t.toString)
+}
+
+// Compares the BF size estimate with the exact cardinality of the Set[T] it was built from.
+class BloomFilterCardinality[T: Gen] extends ApproximateProperty {
+  type Exact = Set[T]
+  type Approx = BF
+  type Input = Unit
+  type Result = Long
+
+  val maxNumEntries: Int = 10000
+  val falsePositiveRate: Double = 0.01
+  val seed: Int = 1
+
+  // A set built from between 1 and maxNumEntries generated elements.
+  def exactGenerator: Gen[Set[T]] = for {
+    numEntries <- Gen.choose(1, maxNumEntries)
+    set <- Gen.containerOfN[Set, T](numEntries, implicitly[Gen[T]])
+  } yield set
+
+  // A Bloom filter configured with the set's size and holding every element's string form.
+  def makeApproximate(set: Set[T]): BF = {
+    val bfMonoid = BloomFilter(set.size, falsePositiveRate, seed)
+    val strings = set.map(_.toString).toSeq
+    bfMonoid.create(strings: _*)
+  }
+
+  // The size query takes no argument, so the only input is the unit value.
+  def inputGenerator(set: Set[T]): Gen[Unit] = Gen.const(())
+  def exactResult(s: Set[T], u: Unit): Long = s.size.toLong
+  def approximateResult(bf: BF, u: Unit): Approximate[Long] = bf.size
+}
+
+// Registers the Bloom filter approximate properties; toProp arguments are (property, objectReps, inputReps, falsePositiveRate).
+class BloomFilterProperties extends ApproximateProperties("BloomFilter") {
+  import ApproximateProperty.toProp
+  for (falsePositiveRate <- List(0.1, 0.01, 0.001)) {
+    property(s"has small false positive rate with false positive rate = $falsePositiveRate") = {
+      implicit val intGen: Gen[Int] = Gen.choose(1, 1000)
+      toProp(new BloomFilterFalsePositives[Int](falsePositiveRate), 50, 50, 0.01)
+    }
+  }
+
+  property("approximate cardinality") = {
+    implicit val intGen: Gen[Int] = Gen.choose(1, 1000)
+    toProp(new BloomFilterCardinality[Int], 50, 1, 0.01)
+  }
+}
+
 class BloomFilterTest extends WordSpec with Matchers {
 
   val SEED = 1
@@ -118,45 +198,6 @@ class BloomFilterTest extends WordSpec with Matchers {
       }
     }
 
-    "have small false positive rate" in {
-      val iter = 10000
-
-      Seq(0.1, 0.01, 0.001).foreach { fpProb =>
-        {
-          val fps = (0 until iter).par.map{
-            _ =>
-              {
-                val numEntries = RAND.nextInt(10) + 1
-
-                val bfMonoid = BloomFilter(numEntries, fpProb, SEED)
-
-                val entries = RAND.shuffle((0 until 1000).toList).take(numEntries + 1).map(_.toString)
-                val bf = bfMonoid.create(entries.drop(1): _*)
-
-                if (bf.contains(entries(0)).isTrue) 1.0 else 0.0
-              }
-          }
-
-          val observedFpProb = fps.sum / fps.size
-
-          assert(observedFpProb <= 2 * fpProb)
-        }
-      }
-    }
-
-    "approximate cardinality" in {
-      val bfMonoid = BloomFilterMonoid(10, 100000, SEED)
-      Seq(10, 100, 1000, 10000).foreach { exactCardinality =>
-        val items = (1 until exactCardinality).map { _.toString }
-        val bf = bfMonoid.create(items: _*)
-        val size = bf.size
-
-        assert(size ~ exactCardinality)
-        assert(size.min <= size.estimate)
-        assert(size.max >= size.estimate)
-      }
-    }
-
     "work as an Aggregator" in {
       (0 to 10).foreach{
         _ =>