343 lines
12 KiB
Java
343 lines
12 KiB
Java
/*
|
||
* Copyright (C) 2020 The Android Open Source Project
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*/
|
||
|
||
package com.android.internal.util;
|
||
|
||
import android.annotation.NonNull;
|
||
import android.annotation.Nullable;
|
||
import android.util.SparseArray;
|
||
import android.util.SparseIntArray;
|
||
|
||
import java.util.ArrayList;
|
||
import java.util.Collections;
|
||
import java.util.List;
|
||
|
||
/**
|
||
* <p>A utility which processes a sequence of input (stream) and output the heavy hitters
|
||
* (the frequent ones).</p>
|
||
*
|
||
* @param <T> The type of the input.
|
||
* @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for
|
||
* the definion of heavy hitters and the list of algorithms for detecting it.
|
||
* <p>
|
||
* {@hide}
|
||
*/
|
||
@android.ravenwood.annotation.RavenwoodKeepWholeClass
|
||
public interface HeavyHitterSketch<T> {
|
||
/**
|
||
* Return the default implementation.
|
||
*
|
||
* @return The default implementation.
|
||
*/
|
||
static <V> @NonNull HeavyHitterSketch<V> newDefault() {
|
||
return new HeavyHitterSketchImpl<V>();
|
||
}
|
||
|
||
/**
|
||
* Set the configuration with given parameters
|
||
*
|
||
* @param inputSize The amount of the input.
|
||
* @param capacity The maximum number of distinct input it should track; it defines the lower
|
||
* bound of the output.
|
||
*/
|
||
void setConfig(int inputSize, int capacity);
|
||
|
||
/**
|
||
* Add a new input to the current sketch.
|
||
*
|
||
* @param newInstance The new input
|
||
*/
|
||
void add(@Nullable T newInstance);
|
||
|
||
/**
|
||
* @param k The number of heavy hitters it should return, k < capacity, a value of 0
|
||
* will be equivalent to capacity - 1
|
||
* @param holder The list array into which the elements of the tops are to be stored; a new list
|
||
* would be created and returned if this parameter is null.
|
||
* @param freqs Optional, the frequencies of each items in the returned result
|
||
* @return The top K heavy hitters(frequent input)
|
||
*/
|
||
@Nullable
|
||
List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs);
|
||
|
||
/**
|
||
* @param holder The list array into which the elements of the candidates are to be stored; a
|
||
* new list would be created and returned if this parameter is null.
|
||
* @return The candidate heavy hitters so far, it could include false postives.
|
||
*/
|
||
@Nullable
|
||
List<T> getCandidates(@Nullable List<T> holder);
|
||
|
||
/**
|
||
* Reset this heavy hitter counter
|
||
*/
|
||
void reset();
|
||
|
||
/**
|
||
* @return The ratio of the input to be used as the validation data, Float.NaN means no
|
||
* validation is needed.
|
||
*/
|
||
float getRequiredValidationInputRatio();
|
||
|
||
/**
|
||
* The default implementation of the {@link HeavyHitterSketch}.
|
||
*
|
||
* <p>Currently it consists of two passes: the first pass will take the input into
|
||
* the MG(Misra–Gries) summary; while the secondary pass will validate the output against the
|
||
* input in order to eliminate false postivies.</p>
|
||
*
|
||
* <p>For sure there are better approaches which takes only one pass, but also comes along with
|
||
* overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough
|
||
* pick.</p>
|
||
*
|
||
* @param <T> The type of the input.
|
||
* @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries
|
||
* summary</a> for the detailed explanation of the algorithm.
|
||
*/
|
||
final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> {
|
||
/**
|
||
* The array to track the current heavy hitters, its size < {@link #mCapacity}.
|
||
*/
|
||
private final SparseArray<T> mObjects = new SparseArray<>();
|
||
|
||
/**
|
||
* The frequencies of the current heavy hitters, its size < {@link #mCapacity}.
|
||
*/
|
||
private final SparseIntArray mFrequencies = new SparseIntArray();
|
||
|
||
/**
|
||
* The amount of the input of each pass
|
||
*/
|
||
private int mPassSize;
|
||
|
||
/**
|
||
* The amount of the total input it expects
|
||
*/
|
||
private int mTotalSize;
|
||
|
||
/**
|
||
* The maximum number of distinct input it should track
|
||
*/
|
||
private int mCapacity;
|
||
|
||
/**
|
||
* The amount of inputs it already received.
|
||
*/
|
||
private int mNumInputs;
|
||
|
||
/**
|
||
* Whether or not it's configured properly.
|
||
*/
|
||
private boolean mConfigured;
|
||
|
||
/**
|
||
* Set the configuration with given parameters
|
||
*
|
||
* @param inputSize The amount of the input.
|
||
* @param capacity The maximum number of distinct input it should track; it defines the
|
||
* lower bound of the output.
|
||
*/
|
||
public void setConfig(final int inputSize, final int capacity) {
|
||
if (inputSize < capacity || inputSize <= 1) {
|
||
mConfigured = false;
|
||
throw new IllegalArgumentException();
|
||
}
|
||
reset();
|
||
mTotalSize = inputSize;
|
||
mPassSize = inputSize >> 1;
|
||
mCapacity = capacity;
|
||
mConfigured = true;
|
||
}
|
||
|
||
/**
|
||
* Add a new input to the current sketch.
|
||
*
|
||
* @param newInstance The new input
|
||
*/
|
||
@Override
|
||
public void add(@Nullable final T newInstance) {
|
||
if (!mConfigured) {
|
||
throw new IllegalStateException();
|
||
}
|
||
if (mNumInputs < mPassSize) {
|
||
addToMGSummary(newInstance);
|
||
} else if (mNumInputs < mTotalSize) {
|
||
// Validation pass
|
||
validate(newInstance);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Add an input to the MG summary.
|
||
*
|
||
* <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair
|
||
* in the result set, will have the following property:
|
||
* <code>(freq - inputSize / capacity) ≤ freq' ≤ freq</code>
|
||
* The above freq' is the estimated frequency, while the freq is the actual frequency.
|
||
* </p>
|
||
*/
|
||
private void addToMGSummary(@Nullable final T newInstance) {
|
||
final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
|
||
final int index = mObjects.indexOfKey(hashCode);
|
||
// MG summary
|
||
if (index >= 0) {
|
||
mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
|
||
} else if (mObjects.size() < mCapacity - 1) {
|
||
mObjects.put(hashCode, newInstance);
|
||
mFrequencies.put(hashCode, 1);
|
||
} else {
|
||
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
||
final int val = mFrequencies.valueAt(i) - 1;
|
||
if (val == 0) {
|
||
mObjects.removeAt(i);
|
||
mFrequencies.removeAt(i);
|
||
} else {
|
||
mFrequencies.setValueAt(i, val);
|
||
}
|
||
}
|
||
}
|
||
if (++mNumInputs == mPassSize) {
|
||
// Clear all the frequencies as we are going to validate them next
|
||
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
||
mFrequencies.setValueAt(i, 0);
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Validate the results from MG summary; the ones with frequencies less than lower boundary
|
||
* will be removed from the set.
|
||
*/
|
||
private void validate(@Nullable final T newInstance) {
|
||
final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
|
||
final int index = mObjects.indexOfKey(hashCode);
|
||
if (index >= 0) {
|
||
mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
|
||
}
|
||
if (++mNumInputs == mTotalSize) {
|
||
final int lower = mPassSize / mCapacity;
|
||
// Remove any occurrences with frequencies less than lower boundary
|
||
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
||
final int val = mFrequencies.valueAt(i);
|
||
if (val < lower) {
|
||
mFrequencies.removeAt(i);
|
||
mObjects.removeAt(i);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* @param k The number of heavy hitters it should return, k < capacity, a value of 0
|
||
* will be equivalent to capacity - 1
|
||
* @param holder The list array into which the elements of the tops are to be stored; a new
|
||
* list would be created and returned if this parameter is null.
|
||
* @param freqs Optional, the frequencies of each items in the returned result
|
||
* @return The top K heavy hitters(frequent input)
|
||
*/
|
||
@Override
|
||
@Nullable
|
||
public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder,
|
||
final @Nullable List<Float> freqs) {
|
||
if (!mConfigured) {
|
||
throw new IllegalStateException();
|
||
}
|
||
|
||
if (k >= mCapacity) {
|
||
throw new IllegalArgumentException();
|
||
}
|
||
|
||
if (mNumInputs < mTotalSize) {
|
||
// It hasn't had all the inputs yet.
|
||
throw new IllegalStateException();
|
||
}
|
||
|
||
ArrayList<Integer> indexes = null;
|
||
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
||
final int val = mFrequencies.valueAt(i);
|
||
if (val > 0) {
|
||
if (indexes == null) {
|
||
indexes = new ArrayList<>();
|
||
}
|
||
indexes.add(i);
|
||
}
|
||
}
|
||
if (indexes == null) {
|
||
return null;
|
||
}
|
||
|
||
Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a));
|
||
|
||
final List<T> result = holder != null ? holder : new ArrayList<T>();
|
||
final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size());
|
||
for (int i = 0; i < max; i++) {
|
||
final int index = indexes.get(i);
|
||
final T obj = mObjects.valueAt(index);
|
||
if (obj != null) {
|
||
result.add(obj);
|
||
if (freqs != null) {
|
||
freqs.add((float) mFrequencies.valueAt(index) / mPassSize);
|
||
}
|
||
}
|
||
}
|
||
return result;
|
||
}
|
||
|
||
/**
|
||
* @param holder The list array into which the elements of the candidates are to be stored;
|
||
* a new list would be created and returned if this parameter is null.
|
||
* @return The candidate heavy hitters so far, it could include false postives.
|
||
*/
|
||
@Nullable
|
||
public List<T> getCandidates(final @Nullable List<T> holder) {
|
||
if (!mConfigured) {
|
||
throw new IllegalStateException();
|
||
}
|
||
if (mNumInputs < mPassSize) {
|
||
// It hasn't done with the first pass yet, return nothing
|
||
return null;
|
||
}
|
||
|
||
List<T> result = holder != null ? holder : new ArrayList<T>();
|
||
for (int i = mObjects.size() - 1; i >= 0; i--) {
|
||
final T obj = mObjects.valueAt(i);
|
||
if (obj != null) {
|
||
result.add(obj);
|
||
}
|
||
}
|
||
return result;
|
||
}
|
||
|
||
/**
|
||
* Reset this heavy hitter counter
|
||
*/
|
||
@Override
|
||
public void reset() {
|
||
mNumInputs = 0;
|
||
mObjects.clear();
|
||
mFrequencies.clear();
|
||
}
|
||
|
||
/**
|
||
* @return The ratio of the input to be used as the validation data, Float.NaN means no
|
||
* validation is needed.
|
||
*/
|
||
public float getRequiredValidationInputRatio() {
|
||
return 0.5f;
|
||
}
|
||
}
|
||
}
|