mirror of
https://github.com/mitb-archive/filebot
synced 2025-01-11 05:48:01 -05:00
LocalSearch does not seem to benefit from any kind of parallelism, and most of the time is spent in the initial transliterator/indexing step (which also doesn't get much faster with parallel processing)
This commit is contained in:
parent
0fa1d0f26f
commit
d5bacdcb23
@ -1,107 +1,54 @@
|
|||||||
package net.filebot.web;
|
package net.filebot.web;
|
||||||
|
|
||||||
import static java.util.Collections.*;
|
import static java.util.Collections.*;
|
||||||
|
import static java.util.Comparator.*;
|
||||||
|
import static java.util.stream.Collectors.*;
|
||||||
import static net.filebot.similarity.Normalization.*;
|
import static net.filebot.similarity.Normalization.*;
|
||||||
|
|
||||||
import java.util.AbstractList;
|
import java.util.AbstractMap.SimpleImmutableEntry;
|
||||||
import java.util.AbstractMap.SimpleEntry;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
|
import java.util.Objects;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.Callable;
|
|
||||||
import java.util.concurrent.ExecutionException;
|
import java.util.concurrent.ExecutionException;
|
||||||
import java.util.concurrent.ExecutorService;
|
import java.util.stream.IntStream;
|
||||||
import java.util.concurrent.Executors;
|
|
||||||
import java.util.concurrent.Future;
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
import uk.ac.shef.wit.simmetrics.similaritymetrics.AbstractStringMetric;
|
||||||
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
import uk.ac.shef.wit.simmetrics.similaritymetrics.QGramsDistance;
|
||||||
|
|
||||||
import com.ibm.icu.text.Transliterator;
|
|
||||||
|
|
||||||
public class LocalSearch<T> {
|
public class LocalSearch<T> {
|
||||||
|
|
||||||
private static final ExecutorService localThreadPool = Executors.newWorkStealingPool();
|
private AbstractStringMetric metric = new QGramsDistance();
|
||||||
|
|
||||||
private final AbstractStringMetric metric = new QGramsDistance();
|
|
||||||
private float resultMinimumSimilarity = 0.5f;
|
private float resultMinimumSimilarity = 0.5f;
|
||||||
private int resultSetSize = 20;
|
private int resultSetSize = 20;
|
||||||
|
|
||||||
private final Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
private Transliterator transliterator = Transliterator.getInstance("Any-Latin;Latin-ASCII;[:Diacritic:]remove");
|
||||||
|
|
||||||
private final List<T> objects;
|
private List<T> objects;
|
||||||
private final List<Set<String>> fields;
|
private List<Set<String>> fields;
|
||||||
|
|
||||||
public LocalSearch(Collection<? extends T> data) {
|
public LocalSearch(Collection<? extends T> data) {
|
||||||
objects = new ArrayList<T>(data);
|
objects = new ArrayList<T>(data);
|
||||||
fields = new ArrayList<Set<String>>(objects.size());
|
fields = objects.stream().map(this::getFields).collect(toList());
|
||||||
|
|
||||||
for (int i = 0; i < objects.size(); i++) {
|
|
||||||
fields.add(i, getFields(objects.get(i)));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<T> search(String query) throws ExecutionException, InterruptedException {
|
public List<T> search(String q) throws ExecutionException, InterruptedException {
|
||||||
final String q = normalize(query);
|
String query = normalize(q);
|
||||||
List<Callable<Entry<T, Float>>> tasks = new ArrayList<Callable<Entry<T, Float>>>(objects.size());
|
|
||||||
|
|
||||||
for (int i = 0; i < objects.size(); i++) {
|
return IntStream.range(0, objects.size()).mapToObj(i -> {
|
||||||
final int index = i;
|
T object = objects.get(i);
|
||||||
tasks.add(new Callable<Entry<T, Float>>() {
|
Set<String> field = fields.get(i);
|
||||||
|
|
||||||
@Override
|
boolean match = field.stream().anyMatch(it -> it.contains(query));
|
||||||
public Entry<T, Float> call() throws Exception {
|
double similarity = field.stream().mapToDouble(it -> metric.getSimilarity(query, it)).max().orElse(0);
|
||||||
float similarity = 0;
|
|
||||||
boolean match = false;
|
|
||||||
|
|
||||||
for (String field : fields.get(index)) {
|
return match || similarity > resultMinimumSimilarity ? new SimpleImmutableEntry<T, Double>(object, similarity) : null;
|
||||||
match |= field.contains(q);
|
}).filter(Objects::nonNull).sorted(reverseOrder(comparing(Entry::getValue))).limit(resultSetSize).map(Entry::getKey).collect(toList());
|
||||||
similarity = Math.max(metric.getSimilarity(q, field), similarity);
|
|
||||||
}
|
|
||||||
|
|
||||||
return match || similarity > resultMinimumSimilarity ? new SimpleEntry<T, Float>(objects.get(index), similarity) : null;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
final List<Entry<T, Float>> resultSet = new ArrayList<Entry<T, Float>>(objects.size());
|
|
||||||
|
|
||||||
for (Future<Entry<T, Float>> entry : localThreadPool.invokeAll(tasks)) {
|
|
||||||
if (entry.get() != null) {
|
|
||||||
resultSet.add(entry.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Thread.interrupted()) {
|
|
||||||
throw new InterruptedException();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// sort by similarity descending (best matches first)
|
|
||||||
sort(resultSet, new Comparator<Entry<T, Float>>() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(Entry<T, Float> o1, Entry<T, Float> o2) {
|
|
||||||
return o2.getValue().compareTo(o1.getValue());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// view for the first 20 search results
|
|
||||||
return new AbstractList<T>() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public T get(int index) {
|
|
||||||
return resultSet.get(index).getKey();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int size() {
|
|
||||||
return Math.min(resultSetSize, resultSet.size());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
|
public void setResultMinimumSimilarity(float resultMinimumSimilarity) {
|
||||||
|
Loading…
Reference in New Issue
Block a user