343 lines
12 KiB
Java
343 lines
12 KiB
Java
![]() |
/*
|
|||
|
* Copyright (C) 2020 The Android Open Source Project
|
|||
|
*
|
|||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|||
|
* you may not use this file except in compliance with the License.
|
|||
|
* You may obtain a copy of the License at
|
|||
|
*
|
|||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|||
|
*
|
|||
|
* Unless required by applicable law or agreed to in writing, software
|
|||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|||
|
* See the License for the specific language governing permissions and
|
|||
|
* limitations under the License.
|
|||
|
*/
|
|||
|
|
|||
|
package com.android.internal.util;
|
|||
|
|
|||
|
import android.annotation.NonNull;
|
|||
|
import android.annotation.Nullable;
|
|||
|
import android.util.SparseArray;
|
|||
|
import android.util.SparseIntArray;
|
|||
|
|
|||
|
import java.util.ArrayList;
|
|||
|
import java.util.Collections;
|
|||
|
import java.util.List;
|
|||
|
|
|||
|
/**
|
|||
|
* <p>A utility which processes a sequence of input (stream) and output the heavy hitters
|
|||
|
* (the frequent ones).</p>
|
|||
|
*
|
|||
|
* @param <T> The type of the input.
|
|||
|
* @see <a href="https://en.wikipedia.org/wiki/Streaming_algorithm">Stream Algorithm</a> for
|
|||
|
* the definion of heavy hitters and the list of algorithms for detecting it.
|
|||
|
* <p>
|
|||
|
* {@hide}
|
|||
|
*/
|
|||
|
@android.ravenwood.annotation.RavenwoodKeepWholeClass
|
|||
|
public interface HeavyHitterSketch<T> {
|
|||
|
/**
|
|||
|
* Return the default implementation.
|
|||
|
*
|
|||
|
* @return The default implementation.
|
|||
|
*/
|
|||
|
static <V> @NonNull HeavyHitterSketch<V> newDefault() {
|
|||
|
return new HeavyHitterSketchImpl<V>();
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* Set the configuration with given parameters
|
|||
|
*
|
|||
|
* @param inputSize The amount of the input.
|
|||
|
* @param capacity The maximum number of distinct input it should track; it defines the lower
|
|||
|
* bound of the output.
|
|||
|
*/
|
|||
|
void setConfig(int inputSize, int capacity);
|
|||
|
|
|||
|
/**
|
|||
|
* Add a new input to the current sketch.
|
|||
|
*
|
|||
|
* @param newInstance The new input
|
|||
|
*/
|
|||
|
void add(@Nullable T newInstance);
|
|||
|
|
|||
|
/**
|
|||
|
* @param k The number of heavy hitters it should return, k < capacity, a value of 0
|
|||
|
* will be equivalent to capacity - 1
|
|||
|
* @param holder The list array into which the elements of the tops are to be stored; a new list
|
|||
|
* would be created and returned if this parameter is null.
|
|||
|
* @param freqs Optional, the frequencies of each items in the returned result
|
|||
|
* @return The top K heavy hitters(frequent input)
|
|||
|
*/
|
|||
|
@Nullable
|
|||
|
List<T> getTopHeavyHitters(int k, @Nullable List<T> holder, @Nullable List<Float> freqs);
|
|||
|
|
|||
|
/**
|
|||
|
* @param holder The list array into which the elements of the candidates are to be stored; a
|
|||
|
* new list would be created and returned if this parameter is null.
|
|||
|
* @return The candidate heavy hitters so far, it could include false postives.
|
|||
|
*/
|
|||
|
@Nullable
|
|||
|
List<T> getCandidates(@Nullable List<T> holder);
|
|||
|
|
|||
|
/**
|
|||
|
* Reset this heavy hitter counter
|
|||
|
*/
|
|||
|
void reset();
|
|||
|
|
|||
|
/**
|
|||
|
* @return The ratio of the input to be used as the validation data, Float.NaN means no
|
|||
|
* validation is needed.
|
|||
|
*/
|
|||
|
float getRequiredValidationInputRatio();
|
|||
|
|
|||
|
/**
|
|||
|
* The default implementation of the {@link HeavyHitterSketch}.
|
|||
|
*
|
|||
|
* <p>Currently it consists of two passes: the first pass will take the input into
|
|||
|
* the MG(Misra–Gries) summary; while the secondary pass will validate the output against the
|
|||
|
* input in order to eliminate false postivies.</p>
|
|||
|
*
|
|||
|
* <p>For sure there are better approaches which takes only one pass, but also comes along with
|
|||
|
* overheads in terms of cpu/memory cost; the MG summary would be a trivial and good enough
|
|||
|
* pick.</p>
|
|||
|
*
|
|||
|
* @param <T> The type of the input.
|
|||
|
* @see <a href="https://en.wikipedia.org/wiki/Misra%E2%80%93Gries_summary">Misra–Gries
|
|||
|
* summary</a> for the detailed explanation of the algorithm.
|
|||
|
*/
|
|||
|
final class HeavyHitterSketchImpl<T> implements HeavyHitterSketch<T> {
|
|||
|
/**
|
|||
|
* The array to track the current heavy hitters, its size < {@link #mCapacity}.
|
|||
|
*/
|
|||
|
private final SparseArray<T> mObjects = new SparseArray<>();
|
|||
|
|
|||
|
/**
|
|||
|
* The frequencies of the current heavy hitters, its size < {@link #mCapacity}.
|
|||
|
*/
|
|||
|
private final SparseIntArray mFrequencies = new SparseIntArray();
|
|||
|
|
|||
|
/**
|
|||
|
* The amount of the input of each pass
|
|||
|
*/
|
|||
|
private int mPassSize;
|
|||
|
|
|||
|
/**
|
|||
|
* The amount of the total input it expects
|
|||
|
*/
|
|||
|
private int mTotalSize;
|
|||
|
|
|||
|
/**
|
|||
|
* The maximum number of distinct input it should track
|
|||
|
*/
|
|||
|
private int mCapacity;
|
|||
|
|
|||
|
/**
|
|||
|
* The amount of inputs it already received.
|
|||
|
*/
|
|||
|
private int mNumInputs;
|
|||
|
|
|||
|
/**
|
|||
|
* Whether or not it's configured properly.
|
|||
|
*/
|
|||
|
private boolean mConfigured;
|
|||
|
|
|||
|
/**
|
|||
|
* Set the configuration with given parameters
|
|||
|
*
|
|||
|
* @param inputSize The amount of the input.
|
|||
|
* @param capacity The maximum number of distinct input it should track; it defines the
|
|||
|
* lower bound of the output.
|
|||
|
*/
|
|||
|
public void setConfig(final int inputSize, final int capacity) {
|
|||
|
if (inputSize < capacity || inputSize <= 1) {
|
|||
|
mConfigured = false;
|
|||
|
throw new IllegalArgumentException();
|
|||
|
}
|
|||
|
reset();
|
|||
|
mTotalSize = inputSize;
|
|||
|
mPassSize = inputSize >> 1;
|
|||
|
mCapacity = capacity;
|
|||
|
mConfigured = true;
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* Add a new input to the current sketch.
|
|||
|
*
|
|||
|
* @param newInstance The new input
|
|||
|
*/
|
|||
|
@Override
|
|||
|
public void add(@Nullable final T newInstance) {
|
|||
|
if (!mConfigured) {
|
|||
|
throw new IllegalStateException();
|
|||
|
}
|
|||
|
if (mNumInputs < mPassSize) {
|
|||
|
addToMGSummary(newInstance);
|
|||
|
} else if (mNumInputs < mTotalSize) {
|
|||
|
// Validation pass
|
|||
|
validate(newInstance);
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* Add an input to the MG summary.
|
|||
|
*
|
|||
|
* <p>Note the frequency in the result set is an estimation. Every (obj, freq') pair
|
|||
|
* in the result set, will have the following property:
|
|||
|
* <code>(freq - inputSize / capacity) ≤ freq' ≤ freq</code>
|
|||
|
* The above freq' is the estimated frequency, while the freq is the actual frequency.
|
|||
|
* </p>
|
|||
|
*/
|
|||
|
private void addToMGSummary(@Nullable final T newInstance) {
|
|||
|
final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
|
|||
|
final int index = mObjects.indexOfKey(hashCode);
|
|||
|
// MG summary
|
|||
|
if (index >= 0) {
|
|||
|
mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
|
|||
|
} else if (mObjects.size() < mCapacity - 1) {
|
|||
|
mObjects.put(hashCode, newInstance);
|
|||
|
mFrequencies.put(hashCode, 1);
|
|||
|
} else {
|
|||
|
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
|||
|
final int val = mFrequencies.valueAt(i) - 1;
|
|||
|
if (val == 0) {
|
|||
|
mObjects.removeAt(i);
|
|||
|
mFrequencies.removeAt(i);
|
|||
|
} else {
|
|||
|
mFrequencies.setValueAt(i, val);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
if (++mNumInputs == mPassSize) {
|
|||
|
// Clear all the frequencies as we are going to validate them next
|
|||
|
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
|||
|
mFrequencies.setValueAt(i, 0);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* Validate the results from MG summary; the ones with frequencies less than lower boundary
|
|||
|
* will be removed from the set.
|
|||
|
*/
|
|||
|
private void validate(@Nullable final T newInstance) {
|
|||
|
final int hashCode = newInstance != null ? newInstance.hashCode() : 0;
|
|||
|
final int index = mObjects.indexOfKey(hashCode);
|
|||
|
if (index >= 0) {
|
|||
|
mFrequencies.setValueAt(index, mFrequencies.valueAt(index) + 1);
|
|||
|
}
|
|||
|
if (++mNumInputs == mTotalSize) {
|
|||
|
final int lower = mPassSize / mCapacity;
|
|||
|
// Remove any occurrences with frequencies less than lower boundary
|
|||
|
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
|||
|
final int val = mFrequencies.valueAt(i);
|
|||
|
if (val < lower) {
|
|||
|
mFrequencies.removeAt(i);
|
|||
|
mObjects.removeAt(i);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* @param k The number of heavy hitters it should return, k < capacity, a value of 0
|
|||
|
* will be equivalent to capacity - 1
|
|||
|
* @param holder The list array into which the elements of the tops are to be stored; a new
|
|||
|
* list would be created and returned if this parameter is null.
|
|||
|
* @param freqs Optional, the frequencies of each items in the returned result
|
|||
|
* @return The top K heavy hitters(frequent input)
|
|||
|
*/
|
|||
|
@Override
|
|||
|
@Nullable
|
|||
|
public List<T> getTopHeavyHitters(final int k, final @Nullable List<T> holder,
|
|||
|
final @Nullable List<Float> freqs) {
|
|||
|
if (!mConfigured) {
|
|||
|
throw new IllegalStateException();
|
|||
|
}
|
|||
|
|
|||
|
if (k >= mCapacity) {
|
|||
|
throw new IllegalArgumentException();
|
|||
|
}
|
|||
|
|
|||
|
if (mNumInputs < mTotalSize) {
|
|||
|
// It hasn't had all the inputs yet.
|
|||
|
throw new IllegalStateException();
|
|||
|
}
|
|||
|
|
|||
|
ArrayList<Integer> indexes = null;
|
|||
|
for (int i = mFrequencies.size() - 1; i >= 0; i--) {
|
|||
|
final int val = mFrequencies.valueAt(i);
|
|||
|
if (val > 0) {
|
|||
|
if (indexes == null) {
|
|||
|
indexes = new ArrayList<>();
|
|||
|
}
|
|||
|
indexes.add(i);
|
|||
|
}
|
|||
|
}
|
|||
|
if (indexes == null) {
|
|||
|
return null;
|
|||
|
}
|
|||
|
|
|||
|
Collections.sort(indexes, (a, b) -> mFrequencies.valueAt(b) - mFrequencies.valueAt(a));
|
|||
|
|
|||
|
final List<T> result = holder != null ? holder : new ArrayList<T>();
|
|||
|
final int max = Math.min(k == 0 ? (mCapacity - 1) : k, indexes.size());
|
|||
|
for (int i = 0; i < max; i++) {
|
|||
|
final int index = indexes.get(i);
|
|||
|
final T obj = mObjects.valueAt(index);
|
|||
|
if (obj != null) {
|
|||
|
result.add(obj);
|
|||
|
if (freqs != null) {
|
|||
|
freqs.add((float) mFrequencies.valueAt(index) / mPassSize);
|
|||
|
}
|
|||
|
}
|
|||
|
}
|
|||
|
return result;
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* @param holder The list array into which the elements of the candidates are to be stored;
|
|||
|
* a new list would be created and returned if this parameter is null.
|
|||
|
* @return The candidate heavy hitters so far, it could include false postives.
|
|||
|
*/
|
|||
|
@Nullable
|
|||
|
public List<T> getCandidates(final @Nullable List<T> holder) {
|
|||
|
if (!mConfigured) {
|
|||
|
throw new IllegalStateException();
|
|||
|
}
|
|||
|
if (mNumInputs < mPassSize) {
|
|||
|
// It hasn't done with the first pass yet, return nothing
|
|||
|
return null;
|
|||
|
}
|
|||
|
|
|||
|
List<T> result = holder != null ? holder : new ArrayList<T>();
|
|||
|
for (int i = mObjects.size() - 1; i >= 0; i--) {
|
|||
|
final T obj = mObjects.valueAt(i);
|
|||
|
if (obj != null) {
|
|||
|
result.add(obj);
|
|||
|
}
|
|||
|
}
|
|||
|
return result;
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* Reset this heavy hitter counter
|
|||
|
*/
|
|||
|
@Override
|
|||
|
public void reset() {
|
|||
|
mNumInputs = 0;
|
|||
|
mObjects.clear();
|
|||
|
mFrequencies.clear();
|
|||
|
}
|
|||
|
|
|||
|
/**
|
|||
|
* @return The ratio of the input to be used as the validation data, Float.NaN means no
|
|||
|
* validation is needed.
|
|||
|
*/
|
|||
|
public float getRequiredValidationInputRatio() {
|
|||
|
return 0.5f;
|
|||
|
}
|
|||
|
}
|
|||
|
}
|