Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9da8eb1
basic implementation of ApproximateProperty
sid-kap Aug 3, 2015
b4da480
Some work
sid-kap Aug 4, 2015
e60f0b5
more stuff
sid-kap Aug 4, 2015
05bdf25
work on tests
sid-kap Aug 4, 2015
e9fedb6
Basic implementation of ApproximateProperty
sid-kap Aug 4, 2015
de57b71
More changes
sid-kap Aug 5, 2015
409ffc9
Work on hyperloglogtests
sid-kap Aug 5, 2015
0f997f3
Work on HyperLogLogTests
sid-kap Aug 5, 2015
118a380
Sample failing test
sid-kap Aug 5, 2015
d0c0187
Fix HLL intersection test (now fails weirdly)
sid-kap Aug 5, 2015
c66d1c3
Add intersection size == sizeOf sanity test
sid-kap Aug 5, 2015
1e5e9a2
Add comment for toProp
sid-kap Aug 5, 2015
9bd6a7f
Fix HLL intersection test
sid-kap Aug 6, 2015
587bb63
Work more on HyperLogLogTests
sid-kap Aug 7, 2015
b76b6af
Use Hash128 in HyperLogLogTests
sid-kap Aug 10, 2015
4963b1b
Add SetSizeHashAggregator tests
sid-kap Aug 10, 2015
931aad9
Merge branch 'develop' into test_failure_bounds
sid-kap Aug 10, 2015
96d94bc
Add support for ApproximateBoolean
sid-kap Aug 10, 2015
c6da416
Test HyperLogLogSeries using ApproximateProperty
sid-kap Aug 11, 2015
e56b612
Write BloomFilterTests, handle case where prob = 0
sid-kap Aug 11, 2015
46390c9
Revert accidental change to version.sbt
sid-kap Aug 12, 2015
97f06eb
Refactor GeneralizedApproximate[T] -> ApproximateSet[T]
sid-kap Aug 13, 2015
96e953a
Merge branch 'develop' into test_failure_bounds
sid-kap Aug 19, 2015
6653e65
Small change in CountMinSketchTest
sid-kap Aug 19, 2015
a750192
Fix incorrect merge conflict resolution
sid-kap Aug 20, 2015
9e599c1
Merge branch 'develop' into test_failure_bounds
sid-kap Oct 4, 2015
185eab3
Update Scalacheck version
sid-kap Oct 4, 2015
bdb7354
Travis, please run the tests again
sid-kap Oct 4, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@ limitations under the License.

package com.twitter.algebird

private[algebird] trait ApproximateSet[T] {
def contains(t: T): ApproximateBoolean
}

// This gives an answer, and a LOWER BOUND on the probability that answer is
// correct
case class ApproximateBoolean(isTrue: Boolean, withProb: Double) {
case class ApproximateBoolean(isTrue: Boolean, withProb: Double) extends ApproximateSet[Boolean] {

def not: ApproximateBoolean = ApproximateBoolean(!isTrue, withProb)

Expand Down Expand Up @@ -58,6 +62,8 @@ case class ApproximateBoolean(isTrue: Boolean, withProb: Double) {
ApproximateBoolean(false, newP)
}
}

def contains(b: Boolean): ApproximateBoolean = if (isTrue) this else not
}

object ApproximateBoolean {
Expand All @@ -67,7 +73,7 @@ object ApproximateBoolean {
}

// Note the probWithinBounds is a LOWER BOUND (at least this probability)
case class Approximate[N](min: N, estimate: N, max: N, probWithinBounds: Double)(implicit val numeric: Numeric[N]) {
case class Approximate[N](min: N, estimate: N, max: N, probWithinBounds: Double)(implicit val numeric: Numeric[N]) extends ApproximateSet[N] {
require(numeric.lteq(min, estimate) && numeric.lteq(estimate, max))

/**
Expand Down Expand Up @@ -113,10 +119,10 @@ case class Approximate[N](min: N, estimate: N, max: N, probWithinBounds: Double)
this
} else {
val n = numeric
val ends = for (
leftv <- List(min, max);
val ends = for {
leftv <- List(min, max)
rightv <- List(right.min, right.max)
) yield n.times(leftv, rightv)
} yield n.times(leftv, rightv)

val newProb = probWithinBounds * right.probWithinBounds

Expand Down Expand Up @@ -159,7 +165,7 @@ object Approximate {
// Not a group/ring:
// negate fails: x - x != 0, because with some probability the bound is bad.
// distributive fails because a*b + a*c ignores that a is either in or out
// of the bound, and counts it idependently.
// of the bound, and counts it independently.
implicit def monoid[N](implicit n: Numeric[N]): Monoid[Approximate[N]] = {
// avoid capturing the Numeric:
val z = Approximate.zero[N]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,15 @@ object HyperLogLog {
buf
}

implicit def int2Bytes(i: Int) = {
implicit def int2Bytes(i: Int): Array[Byte] = {
val buf = new Array[Byte](4)
ByteBuffer
.wrap(buf)
.putInt(i)
buf
}

implicit def long2Bytes(i: Long) = {
implicit def long2Bytes(i: Long): Array[Byte] = {
val buf = new Array[Byte](8)
ByteBuffer
.wrap(buf)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package com.twitter.algebird

import org.scalacheck.{ Gen, Prop, Properties, Test }
import org.scalacheck.util.Pretty

trait ApproximateProperty {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is really nice and quite well designed. Good work.

type Exact
type Approx
type Input
type Result

def exactGenerator: Gen[Exact]
def inputGenerator(e: Exact): Gen[Input]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this exact value corresponding to?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Exact is useful here so that we can generate appropriate test inputs.

For example, suppose we're testing CMS[Long]. We need to generate some inputs so that we can query exactList.frequency(x) and cms.frequency(x) and compare them.
We could simply query using random Long values. However, most of the random Long values we generate would probably not be in exactList, so these test cases would not be very useful. If we have exactList, we can choose random elements from exactList as our test cases.

I'm not sure if this is the best way to do this.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but why couldn't we do something like:

// I need an exact value for my implementation:
def needExact(e: Exact): Gen[Input]
// but I could still implement this:
def inputGenerator: Gen[Input] = exactGenerator.flatMap(needExact) 

Am I missing something? Putting this in the trait as is seems to be constraining more than is needed. Why can't we just get by with def inputGenerator: Gen[Input]


def makeApproximate(e: Exact): Approx

def exactResult(e: Exact, i: Input): Result
def approximateResult(a: Approx, i: Input): ApproximateSet[Result]
}

object ApproximateProperty {

/**
* Generates a list of exactly n Ts.
* Useful because `Gen.listOfN(n, gen).sample` gives us Option[List[T]],
* while we often want List[T].
*/
private def genListOf[T](n: Int, gen: Gen[T]): List[T] = {
Gen.listOfN(n, gen).sample match {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm concerned that this is not using the Gen.Parameters, so I'm afraid that when scalacheck gives us an example input that fails a test, we can't check it at the REPL. Is there some way to plumb through the Random instance from Gen.Parameters (I think has it), so that we are not using a time-seeded Random?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This might not a problem since I'm not using scalacheck's error reporting mechanism. (I had to write my own hacky error reporter using scalacheck args.)
Here's an example of a failure:
https://travis-ci.org/twitter/algebird/jobs/74179669#L2565
It always prints successes, expected successes, and required successes. When a test fails, it prints some details from the first few failures.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That link just gives me "This is not an active repository". Are there some creds needed to see it? Can you post a screenshot?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It actually looks like the travis settings somehow got broken for algebird, it seems to be disabled in travis. /cc @caniszczyk

case Some(xs) => xs
case _ => genListOf(n, gen)
}
}

private def successesAndProbabilities(a: ApproximateProperty, objectReps: Int, inputReps: Int): List[(Int, Double, List[String])] =
genListOf(objectReps, a.exactGenerator)
.flatMap { exact =>
val approx = a.makeApproximate(exact)
genListOf(inputReps, a.inputGenerator(exact)).flatMap { input =>
val approxResult = a.approximateResult(approx, input)
val exactResult = a.exactResult(exact, input)

val success = approxResult.contains(exactResult)
if (success.withProb == 0.0) {
None
} else {
val successInt = if (success.isTrue) 1 else 0
val messages = if (success.isTrue) List() else List(s"Exact result: $exactResult. Approx result: $approxResult.")
Some((successInt, success.withProb, messages))
}
}
}

def toProp(a: ApproximateProperty, objectReps: Int, inputReps: Int, falsePositiveRate: Double): Prop =
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A better name for falsePositiveRate might be falseFailureRate

new Prop {
def apply(params: Gen.Parameters) = {
require(0 <= falsePositiveRate && falsePositiveRate <= 1)

val list = successesAndProbabilities(a, objectReps, inputReps)
val n = list.length

val monoid = implicitly[Monoid[(Int, Double, List[String])]]
val (successes, sumOfProbabilities, exacts) = monoid.sum(list)

// Computed from Hoeffding's inequality, might be inaccurate
// TODO Make sure this is correct
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's link to:
https://en.wikipedia.org/wiki/Hoeffding%27s_inequality

and add the result;

When Prob(X = 1) >= p, then

Prob(sum(X) <= (p - eps)*n) <= exp(-2*eps*eps*n)

If we want that Prob <= fprate, then it is sufficient that, fprate = exp(-2*eps*eps*n) or:
eps = math.sqrt( - math.log(fprate)/(2.0 * n) )

So, I think there is a slight error in your version (n is in the numerator in yours).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

diff = n * fprate, so it's correct in the numerator.

My only concern here is that the formula is for the Bernoulli version of Hoeffding's. Since we're taking separate probability values from each result, we aren't guaranteed that they'll be uniform (very possibly they won't be). But maybe it's "close enough."

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The general case (https://en.wikipedia.org/wiki/Hoeffding%27s_inequality#General_case) is

P(sum(X) - sum(E[X]) >= nt) <= exp(-2*n*t^2)

(here I used sum(X) = X_1 + ... + X_n instead of Xbar = 1/n (X_1 + ... + X_N))
So we have

   exp(-2*n*t^2) = fprate
=> t = sqrt(-log(fprate) / (2*n) )

So I think Oscar's right.
I don't think this assumes the Bernoulli distribution.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops, I read Joe's comment again, and he's right.
diff = n * t, so we need diff = sqrt(-n * log(fprate) / 2).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree. I overlooked the n * t on the outside.

I think the bound applies perfectly, with the assumption that our Random class is a real RNG. That is obviously not true, and in fact it is pseudo-random, but I don't see how that is escapable. We could beef up the RNG used to be stronger (but certainly slower), but that might require some changes to scalacheck, not sure.

val diff = scala.math.sqrt(-n * scala.math.log(falsePositiveRate) / 2.0)

val success = if (successes >= (sumOfProbabilities - diff)) Prop.Proof else Prop.False

// Args that get printed when Scalacheck runs the test
val argsList: List[(String, String)] = {
val results = List(("Successes", s"$successes (out of $n)"),
("Expected successes", "%.2f".format(sumOfProbabilities)),
("Required successes", "%.2f".format(sumOfProbabilities - diff)))

val exampleFailures =
if (success == Prop.False)
List(("Example failures:\n >", exacts.take(5).mkString("\n >")))
else List()

val zeroProbTests = objectReps * inputReps - n
val testsReturnedZeroProb =
if (zeroProbTests > 0) {
List(("Omitted results", s"${zeroProbTests}/${objectReps * inputReps} tests returned an Approximate with probability 0. These tests have been omitted from the calculation."))
} else List()

results ++ exampleFailures ++ testsReturnedZeroProb
}

val args = argsList.map {
case (name, value) =>
Prop.Arg(name, value, 0, value, Pretty.prettyAny(value), Pretty.prettyAny(value))
}

Prop.Result(success, args = args)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scalacheck prints these args when the test is run, so that we can see the number of successes, etc. when we run the tests

}
}

/**
* Converts a list of ApproximateProperties to a scalacheck Prop that
* fails if too many of the ApproximateProperties fail.
* TODO use `new Prop` like the above `toProp` method so that we can
* have useful error messages.
*/
def toProp(a: Seq[ApproximateProperty], objectReps: Int, inputReps: Int, falsePositiveRate: Double): Prop = {
require(0 <= falsePositiveRate && falsePositiveRate <= 1)

val list = a.flatMap { approximateProp =>
successesAndProbabilities(approximateProp, objectReps, inputReps)
}
val monoid = implicitly[Monoid[(Int, Double, List[String])]]
val (successes, sumOfProbabilities, _) = monoid.sum(list)
val n = list.length

(sumOfProbabilities - successes) > scala.math.sqrt(n * scala.math.log(falsePositiveRate) / -2)
}
}

/**
* All tests that use ApproximateProperty should extend from this class so that
* the scalacheck property is run exactly once.
*/
abstract class ApproximateProperties(name: String) extends Properties(name) {
def overrideParameters(p: Test.Parameters): Test.Parameters =
p.withMinSuccessfulTests(1)
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package com.twitter.algebird

import java.io.{ ByteArrayOutputStream, ObjectOutputStream }

import org.scalacheck.{ Arbitrary, Gen }
import org.scalacheck.{ Arbitrary, Gen, Properties }
import org.scalatest.{ Matchers, WordSpec }
import org.scalacheck.Prop._

Expand Down Expand Up @@ -95,6 +95,86 @@ class BFHashIndices extends CheckProperties {
}
}

class BloomFilterFalsePositives[T: Gen](falsePositiveRate: Double) extends ApproximateProperty {
type Exact = Set[T]
type Approx = BF

type Input = T
type Result = Boolean

val maxNumEntries = 1000

val seed = 1

def exactGenerator = for {
numEntries <- Gen.choose(1, maxNumEntries)
set <- Gen.containerOfN[Set, T](numEntries, implicitly[Gen[T]])
} yield set

def makeApproximate(set: Set[T]) = {
val bfMonoid = BloomFilter(set.size, falsePositiveRate, seed)

val strings = set.map(_.toString).toSeq
bfMonoid.create(strings: _*)
}

def inputGenerator(set: Set[T]) =
for {
randomValues <- Gen.listOfN[T](set.size, implicitly[Gen[T]])
x <- Gen.oneOf((set ++ randomValues).toSeq)
} yield x

def exactResult(s: Set[T], t: T) = s.contains(t)

def approximateResult(bf: BF, t: T) = bf.contains(t.toString)
}

class BloomFilterCardinality[T: Gen] extends ApproximateProperty {
type Exact = Set[T]
type Approx = BF

type Input = Unit
type Result = Long

val maxNumEntries = 10000
val falsePositiveRate = 0.01

val seed = 1

def exactGenerator = for {
numEntries <- Gen.choose(1, maxNumEntries)
set <- Gen.containerOfN[Set, T](numEntries, implicitly[Gen[T]])
} yield set

def makeApproximate(set: Set[T]) = {
val bfMonoid = BloomFilter(set.size, falsePositiveRate, seed)

val strings = set.map(_.toString).toSeq
bfMonoid.create(strings: _*)
}

def inputGenerator(set: Set[T]) = Gen.const(())

def exactResult(s: Set[T], u: Unit) = s.size
def approximateResult(bf: BF, u: Unit) = bf.size
}

class BloomFilterProperties extends ApproximateProperties("BloomFilter") {
import ApproximateProperty.toProp

for (falsePositiveRate <- List(0.1, 0.01, 0.001)) {
property(s"has small false positive rate with false positive rate = $falsePositiveRate") = {
implicit val intGen = Gen.choose(1, 1000)
toProp(new BloomFilterFalsePositives[Int](falsePositiveRate), 50, 50, 0.01)
}
}

property("approximate cardinality") = {
implicit val intGen = Gen.choose(1, 1000)
toProp(new BloomFilterCardinality[Int], 50, 1, 0.01)
}
}

class BloomFilterTest extends WordSpec with Matchers {

val SEED = 1
Expand All @@ -118,45 +198,6 @@ class BloomFilterTest extends WordSpec with Matchers {
}
}

"have small false positive rate" in {
val iter = 10000

Seq(0.1, 0.01, 0.001).foreach { fpProb =>
{
val fps = (0 until iter).par.map{
_ =>
{
val numEntries = RAND.nextInt(10) + 1

val bfMonoid = BloomFilter(numEntries, fpProb, SEED)

val entries = RAND.shuffle((0 until 1000).toList).take(numEntries + 1).map(_.toString)
val bf = bfMonoid.create(entries.drop(1): _*)

if (bf.contains(entries(0)).isTrue) 1.0 else 0.0
}
}

val observedFpProb = fps.sum / fps.size

assert(observedFpProb <= 2 * fpProb)
}
}
}

"approximate cardinality" in {
val bfMonoid = BloomFilterMonoid(10, 100000, SEED)
Seq(10, 100, 1000, 10000).foreach { exactCardinality =>
val items = (1 until exactCardinality).map { _.toString }
val bf = bfMonoid.create(items: _*)
val size = bf.size

assert(size ~ exactCardinality)
assert(size.min <= size.estimate)
assert(size.max >= size.estimate)
}
}

"work as an Aggregator" in {
(0 to 10).foreach{
_ =>
Expand Down
Loading