Commit 1b20e434 authored by Duncan White

anagrams using elastic search

parents
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
use Data::Dumper;
# Base URL of the Elasticsearch index/type holding (word, sig) records.
my $elastic = "http://146.169.44.228:9200/words/anagrams";

# Number of hits to request per lookup.  Elasticsearch returns only the
# first 10 hits unless a size is given, which would silently truncate any
# larger anagram set.
my $maxhits = 1000;

# Words to look up: the command line arguments, or two samples if none given.
my @wordlist = @ARGV;
@wordlist = qw(dog later) if @wordlist == 0;

#
# my $output = run_curl( @args ):
#	Run "curl -s @args" WITHOUT going through the shell and return
#	everything it wrote to stdout.  Using the list form of open means
#	that user-supplied words can never be interpreted as shell syntax
#	(and that '?' and '&' in URLs need no quoting).
#	Dies if curl cannot be started or exits with a non-zero status.
#
sub run_curl
{
    my( @args ) = @_;

    open( my $pipe, '-|', 'curl', '-s', @args )
        or die "anagrams: cannot run curl: $!\n";
    my $output = do { local $/; <$pipe> };

    # close() on a pipe fails if the child exited non-zero; in that case
    # $! is 0 and the exit status is in $?.
    close( $pipe )
        or die "anagrams: curl failed: ",
               ( $! ? $! : "exit status " . ( $? >> 8 ) ), "\n";

    return defined $output ? $output : '';
}

#
# for every word: look up and print all indexed words sharing its signature
#
foreach my $word (@wordlist)
{
    $word = lc($word);

    # calculate the signature [all the letters in sorted order]
    my $sig = join( '', sort( split( //, $word ) ) );

    my $results = run_curl( '-XGET',
                            "$elastic/_search?q=sig:$sig&size=$maxhits" );
    #print "debug: results:$results\n";

    # decode_json dies on malformed input; turn that into a message that
    # shows what the server actually sent back.
    my $data = eval { decode_json( $results ) };
    die "anagrams: unexpected reply while looking up '$word': $results\n"
        unless ref($data) eq 'HASH';

    # an error reply has no hits section: treat it as "no matches"
    my $hits = $data->{hits}->{hits} || [];
    #die Dumper $hits;

    my @w = map { $_->{_source}->{word} } @$hits;
    print "$word: @w\n";
}
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
use Data::Dumper;
# Base URL of the Elasticsearch index/type holding (word, sig) records.
my $elastic = "http://146.169.44.228:9200/words/anagrams";

#
# my $data = fetch_json( @curl_args ):
#	Run "curl -s @curl_args" without involving the shell, parse its
#	output as JSON and return the resulting hash reference.
#	Dies with a descriptive message if curl cannot be run, exits with a
#	non-zero status, or returns something that is not a JSON object.
#
sub fetch_json
{
    my( @curl_args ) = @_;

    open( my $pipe, '-|', 'curl', '-s', @curl_args )
        or die "cannot run curl: $!\n";
    my $reply = do { local $/; <$pipe> };

    # close() on a pipe fails if the child exited non-zero; in that case
    # $! is 0 and the exit status is in $?.
    close( $pipe )
        or die "curl failed: ",
               ( $! ? $! : "exit status " . ( $? >> 8 ) ), "\n";
    $reply = '' unless defined $reply;

    my $data = eval { from_json( $reply ) };
    die "unexpected reply from $elastic: $reply\n"
        unless ref($data) eq 'HASH';
    return $data;
}

# query for signatures and their frequencies
my $facetquery = '{ query: { match_all: {} }, facets: { sig: { terms: { field: "sig", size: 100000 } } } }';
my $data = fetch_json( '-XGET', "$elastic/_search", '-d', $facetquery );

# the (term,count) records; an empty list if the reply had no facet section
my $aref = $data->{facets}->{sig}->{terms} || [];

# keep only the signatures shared by more than one word
my @repeated = grep { $_->{count} > 1 } @$aref;

#
# for every repeated signature
#
foreach my $tc (@repeated)
{
    my $sig   = $tc->{term};
    my $count = $tc->{count};

    # fetch all words with that signature and display them and the sig.
    # Ask for exactly $count hits: without an explicit size Elasticsearch
    # returns at most 10, truncating bigger anagram sets.
    my $data = fetch_json( '-XGET',
                           "$elastic/_search?q=sig:$sig&size=$count" );
    my $hits = $data->{hits}->{hits} || [];
    my @w = map { $_->{_source}->{word} } @$hits;
    print "$sig: @w\n";
}
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
use List::Util qw(max);
use Data::Dumper;
# Base URL of the Elasticsearch index/type holding (word, sig) records.
my $elastic = "http://146.169.44.228:9200/words/anagrams";

#
# my $data = fetch_json( @curl_args ):
#	Run "curl -s @curl_args" without involving the shell, parse its
#	output as JSON and return the resulting hash reference.
#	Dies with a descriptive message if curl cannot be run, exits with a
#	non-zero status, or returns something that is not a JSON object.
#
sub fetch_json
{
    my( @curl_args ) = @_;

    open( my $pipe, '-|', 'curl', '-s', @curl_args )
        or die "cannot run curl: $!\n";
    my $reply = do { local $/; <$pipe> };

    # close() on a pipe fails if the child exited non-zero; in that case
    # $! is 0 and the exit status is in $?.
    close( $pipe )
        or die "curl failed: ",
               ( $! ? $! : "exit status " . ( $? >> 8 ) ), "\n";
    $reply = '' unless defined $reply;

    my $data = eval { from_json( $reply ) };
    die "unexpected reply from $elastic: $reply\n"
        unless ref($data) eq 'HASH';
    return $data;
}

# query for distinct signatures and their frequencies;
# rough equivalent of SQL "select sig,count(sig) from anagrams group by sig"
my $facetquery = '{ query: { match_all: {} }, facets: { sig: { terms: { field: "sig", size: 100000 } } } }';
my $data = fetch_json( '-XGET', "$elastic/_search", '-d', $facetquery );

# extract (from the JSON result) the array of (term,count) records,
# where term is a distinct signature, and count is the no of words
# with that signature; an empty list if the reply had no facet section
my @tc = @{ $data->{facets}->{sig}->{terms} || [] };

# find the maximum count (i.e. the size of the biggest anagram set);
# max() of an empty list is undef, so guard before comparing against it
my $maxcount = max( map { $_->{count} } @tc );

# find all the signatures with that maximal count
my @maxsig;
if( defined $maxcount )
{
    @maxsig = map { $_->{term} } grep { $_->{count} == $maxcount } @tc;
}
else
{
    warn "no signatures found in $elastic\n";
}

#
# for every maximal-count signature
#
foreach my $sig (@maxsig)
{
    # fetch all the words with this signature.  Ask for exactly $maxcount
    # hits: without an explicit size Elasticsearch returns at most 10,
    # so the printed set could be smaller than the printed count.
    my $data = fetch_json( '-XGET',
                           "$elastic/_search?q=sig:$sig&size=$maxcount" );
    my $hits = $data->{hits}->{hits} || [];
    my @w = map { $_->{_source}->{word} } @$hits;
    print "$maxcount: @w\n";
}
#!/usr/bin/perl
use strict;
use warnings;
use open ':locale';
# Base URL of the Elasticsearch index/type that will hold (word, sig) records.
my $elastic = "http://146.169.44.228:9200/words/anagrams";

# Dictionary to load: first command line argument, or the system word list.
my $wordlist = shift @ARGV || '/usr/share/dict/words';

# JSON::PP is a core module; use it to build each document so that any
# character in a word is escaped correctly.  'ascii' emits non-ASCII
# characters as \uXXXX escapes, keeping the curl argument plain ASCII;
# 'canonical' fixes the key order (sig, word).
use JSON::PP ();
my $json = JSON::PP->new->ascii->canonical;

# Open the dictionary BEFORE wiping the index, so that a mistyped file name
# cannot destroy the existing data.
open( my $in, '<', $wordlist )
    or die "cannot open word list '$wordlist': $!\n";

# Remove any previously loaded records.  List-form system() bypasses the
# shell; "-o /dev/null" replaces the old ">/dev/null" redirection.
system( 'curl', '-s', '-o', '/dev/null', '-XDELETE', $elastic ) == 0
    or warn "warning: could not delete $elastic (curl status $?)\n";

#
# for every line (word) in the dictionary, and its sequence number n
#
my %seen;
my $n = 0;
my $failed = 0;
while( my $word = <$in> )
{
    # sanitise the word a bit
    chomp $word;
    $word =~ tr/'//d;
    $word = lc($word);

    # a blank line (or a line of apostrophes) is not a word
    next if $word eq '';
    next if $seen{$word}++;
    $n++;

    # calculate a signature [all the letters in sorted order]
    my $sig = join( '', sort( split( //, $word ) ) );

    # insert (word: word, sig: signature) JSON records
    my $doc = $json->encode( { sig => $sig, word => $word } );
    print "debug: curl -s -XPUT $elastic/$n -d '$doc'\n" if $n % 10000 == 0;

    my $status = system( 'curl', '-s', '-o', '/dev/null',
                         '-XPUT', "$elastic/$n", '-d', $doc );
    if( $status != 0 )
    {
        # report only the first failure in detail; the rest are counted
        warn "warning: insert of '$word' failed (curl status $status)\n"
            if $failed == 0;
        $failed++;
    }
}
close( $in );

warn "warning: $failed of $n inserts failed\n" if $failed;
print "$n distinct words\n";
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment