Commit 7c6af7a8 authored by Duncan White's avatar Duncan White

various changes: split out Defns and used Function::Parameters..

parent 1b20e434
package Defns;
use strict;
use warnings;
use Function::Parameters;
use JSON;
use Exporter 'import';
our @EXPORT = qw($elastic deleteall makesig bulkinsert
insertws searchq queryfreq);
our $elastic = "http://146.169.44.185:9200/words/anagrams";
#
# deleteall(); delete all records from $elastic
#	Sends an HTTP DELETE to $elastic via curl, discarding the response
#	body.  Returns the raw system() status (0 on success); a failure
#	is reported with warn rather than die, so callers that ignore the
#	result keep running as before.
#
fun deleteall()
{
	# list-form system(): no shell parses the command, and
	# "-o /dev/null" takes the place of the old ">/dev/null" redirection
	my $status = system( 'curl', '-s', '-o', '/dev/null',
			     '-XDELETE', $elastic );
	warn "deleteall: curl -XDELETE $elastic failed (status $status)\n"
		if $status != 0;
	return $status;
}
#
# my $sig = makesig( $word ): build the anagram signature of $word,
#	i.e. the characters of $word rearranged into (string) sorted
#	order and glued back together.  Two words are anagrams of each
#	other exactly when their signatures are equal.
#
fun makesig( $word )
{
	my @letters = split( //, $word );
	return join( '', sort @letters );
}
#
# bulkinsert( filename ): bulkinsert the given file (in right format)
#	POSTs the contents of $filename to $elastic/_bulk via curl
#	(--data-binary @file), discarding the response body.  Prints the
#	command as a debug line first.  Returns the raw system() status
#	(0 on success) and warns when curl fails.
#
fun bulkinsert( $filename )
{
	# list-form command: $filename is handed to curl as a single
	# argument, so spaces or shell metacharacters in it are harmless
	my @cmd = ( 'curl', '-s', '-o', '/dev/null',
		    '-XPOST', "$elastic/_bulk",
		    '--data-binary', "\@$filename" );
	print "debug: cmd:@cmd\n";
	my $status = system( @cmd );
	warn "bulkinsert: curl failed for $filename (status $status)\n"
		if $status != 0;
	return $status;
}
#
# insertws( $word, $sig, $n ); insert (word: $word, sig: $sig) JSON record
#	PUTs the record to $elastic/$n via curl, discarding all curl
#	output.  Every 10000th record ($n % 10000 == 0) the command is
#	printed as a debug line.  Returns the raw system() status
#	(0 on success); failures are not reported, as before.
#	NOTE(review): assumes $word and $sig are character strings (not
#	already UTF-8 encoded bytes) - confirm against callers.
#
fun insertws( $word, $sig, $n )
{
	# let the JSON module do the quoting, so a '"' or '\' inside the
	# word cannot produce a malformed document; "$x" forces string type
	my $json = JSON->new->utf8->canonical->encode(
			{ sig => "$sig", word => "$word" } );

	# list-form command: nothing is re-parsed by a shell, so quotes
	# in the JSON need no escaping; -s plus -o /dev/null keep curl quiet
	my @cmd = ( 'curl', '-s', '-o', '/dev/null',
		    '-XPUT', "$elastic/$n", '-d', $json );
	print "debug: @cmd\n" if $n % 10000 == 0;
	return system( @cmd );
}
#
# my $data = searchq( $query );
#	[for example, query might be "sig:$sig"]
#	perform a query_string search, grab the results and parse
#	the returned JSON and return the decoded hashref/arrayref.
#	Dies if curl cannot be started, if it returns nothing at all,
#	or if the response is not valid JSON (decode_json croaks).
#
fun searchq( $query )
{
	# list-form pipe open: the URL goes to curl as one argument, so
	# characters such as '&', ';' or spaces in $query are never
	# interpreted by a shell
	my @cmd = ( 'curl', '-s', '-XGET', "$elastic/_search?q=$query" );
	#doesn't work:..&fields=sig,word&_source=false
	open( my $pipe, '-|', @cmd )
		|| die "searchq: can't run curl: $!\n";
	my $results = do { local $/; <$pipe> };
	close( $pipe );

	# an empty reply would otherwise surface as a cryptic
	# "malformed JSON string" error from decode_json
	die "searchq: empty response from $elastic for query '$query'\n"
		unless defined $results && length $results;

	return decode_json( $results );
}
#
# my $data = queryfreq( $field );
#	using a terms counting facet, query distinct values of a
#	field (eg "sig") and the associated record frequencies.
#	return the JSON result $data, in which there'll be a
#	$data->{facets}->{$field}->{terms} array of
#	{ term=>distinct fieldvalue, count=>frequency of that value }
#	Dies if curl cannot be started, if it returns nothing at all,
#	or if the response is not valid JSON.
#
fun queryfreq( $field )
{
	# rough equivalent of SQL
	# "select $field,count($field) from anagrams group by $field"
	# built as a data structure so that $field is quoted correctly
	my $body = JSON->new->utf8->canonical->encode( {
		query  => { match_all => {} },
		facets => {
			$field => {
				terms => { field => "$field", size => 1000000 },
			},
		},
	} );

	# list-form pipe open: the request body reaches curl as a single
	# argument, with no shell quoting to get wrong
	my @cmd = ( 'curl', '-s', '-XGET', "$elastic/_search", '-d', $body );
	open( my $pipe, '-|', @cmd )
		|| die "queryfreq: can't run curl: $!\n";
	my $results = do { local $/; <$pipe> };
	close( $pipe );

	die "queryfreq: empty response from $elastic for field '$field'\n"
		unless defined $results && length $results;

	# NOTE(review): from_json does not UTF-8-decode its input, whereas
	# searchq uses decode_json; kept as-is so callers see the same
	# strings as before - confirm which is intended
	return from_json( $results );
}
1;
#!/usr/bin/perl
#
# Rebuild the anagram index with one bulk insert:
#   - delete every existing record,
#   - write one "create" action + one (sig, word) document per distinct
#     word of the word list into a bulk data file,
#   - hand that file to bulkinsert().
# Usage: <this script> [wordlist]	(default: /usr/share/dict/words)
#
use strict;
use warnings;
use open ':locale';
use JSON;
use Defns;

my $wordlist = shift @ARGV || '/usr/share/dict/words';
my $bulkfile = '/tmp/bulkdata';

# canonical: stable key order.  No ->utf8 here: the ':locale' layer on
# the output handle performs the character encoding.
my $json = JSON->new->canonical;

deleteall();

open( my $out, '>', $bulkfile ) || die "can't write $bulkfile: $!\n";

#
# for every line (word) in the dictionary, and its line number n
#
open( my $in, '<', $wordlist ) || die "can't read $wordlist: $!\n";

my %seen;
my $n = 0;
while( my $word = <$in> )
{
	# sanitise the word a bit
	chomp $word;
	$word =~ tr/'//d;
	$word = lc($word);
	next if $seen{$word}++;		# skip duplicates after sanitising
	$n++;

	# calculate a signature [all the letters in sorted order]
	my $sig = makesig( $word );

	# append (word: word, sig: signature) JSON records to bulk data
	# file; encoding via JSON keeps each line valid even if the word
	# contains '"' or '\'.  "$n" etc force string-typed values.
	my $action = $json->encode( {
		create => { _index => 'words', _type => 'anagrams', _id => "$n" },
	} );
	my $doc   = $json->encode( { sig => "$sig", word => "$word" } );
	my $entry = "$action\n$doc\n";

	print "debug: $entry\n" if $n % 10000 == 0;
	print $out $entry;
}
close( $in );

# buffered write errors (eg disk full) only show up at close time
close( $out ) || die "error writing $bulkfile: $!\n";

print "doing bulk insert:\n";
bulkinsert( $bulkfile );

print "$n distinct words\n";
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
use Data::Dumper;
#use JSON;
#use Data::Dumper;
use Defns;
my $elastic = "http://146.169.44.228:9200/words/anagrams";
my @wordlist = @ARGV;
@wordlist = qw(dog later) if @wordlist == 0;
......@@ -13,14 +14,10 @@ foreach my $word (@wordlist)
$word = lc($word);
# calculate the signature [all the letters in sorted order]
my $sig = join( '', sort( split( //, $word ) ) );
my $sig = makesig($word);
my $cmd = qq(curl -s -XGET $elastic/_search\?q=sig:$sig);
#doesn't work:..&fields=sig,word&_source=false);
my $data = searchq( "sig:$sig" );
my $results =`$cmd`;
#print "debug: results:$results\n";
my $data = decode_json( $results );
my $aref = $data->{hits}->{hits};
#die Dumper $aref;
my @w = map { $_->{_source}->{word} } @$aref;
......
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
use Data::Dumper;
#use JSON;
#use Data::Dumper;
my $elastic = "http://146.169.44.228:9200/words/anagrams";
use Defns;
# query for signatures and their frequencies
my $cmd = qq(curl -s -XGET $elastic/_search -d '{ query: { match_all: {} }, facets: { sig: { terms: { field: "sig", size: 100000 } } } }');
my $results = `$cmd`;
my $data = from_json( $results );
my $data = queryfreq( "sig" );
# extract (from the JSON result) the array of terms whose count>1
my $aref = $data->{facets}->{sig}->{terms};
my @sig = map { $_->{term} } grep { $_->{count} > 1 } @$aref;
......@@ -20,9 +19,7 @@ my @sig = map { $_->{term} } grep { $_->{count} > 1 } @$aref;
foreach my $sig (@sig)
{
# fetch all words with that signature and display them and the sig
my $cmd = qq(curl -s -XGET $elastic/_search\?q=sig:$sig);
my $results = `$cmd`;
my $data = from_json( $results );
my $data = searchq( "sig:$sig" );
my $aref = $data->{hits}->{hits};
my @w = map { $_->{_source}->{word} } @$aref;
print "$sig: @w\n";
......
#!/usr/bin/perl
use strict;
use warnings;
use JSON;
#use JSON;
use List::Util qw(max);
use Data::Dumper;
my $elastic = "http://146.169.44.228:9200/words/anagrams";
use Defns;
# query for distinct signatures and their frequencies;
# rough equivalent of SQL "select sig,count(sig) from anagrams group by sig"
my $cmd = qq(curl -s -XGET $elastic/_search -d '{ query: { match_all: {} }, facets: { sig: { terms: { field: "sig", size: 100000 } } } }');
my $results = `$cmd`;
my $data = from_json( $results );
my $data = queryfreq( "sig" );
# extract (from the JSON result) the array of [term,count] pairs
# (where term is a distinct signature, and count is the no of words
......@@ -32,9 +28,7 @@ my @maxsig = map { $_->{term} } grep { $_->{count} == $maxcount } @tc;
foreach my $sig (@maxsig)
{
# fetch all the words with this signature
my $cmd = qq(curl -s -XGET $elastic/_search\?q=sig:$sig);
my $results = `$cmd`;
my $data = from_json( $results );
my $data = searchq( "sig:$sig" );
my $aref = $data->{hits}->{hits};
my @w = map { $_->{_source}->{word} } @$aref;
print "$maxcount: @w\n";
......
......@@ -3,10 +3,11 @@ use strict;
use warnings;
use open ':locale';
my $elastic = "http://146.169.44.228:9200/words/anagrams";
use Defns;
my $wordlist = shift @ARGV || '/usr/share/dict/words';
system( "curl -s -XDELETE $elastic >/dev/null" );
deleteall();
#
# for every line (word) in the dictionary, and its line number n
......@@ -24,12 +25,11 @@ while( my $word = <$in> )
$n++;
# calculate a signature [all the letters in sorted order]
my $sig = join( '', sort( split( //, $word ) ) );
my $sig = makesig( $word );
#my $sig = join( '', sort( split( //, $word ) ) );
# insert (word: word, sig: signature) JSON records
my $cmd = qq(curl -s -XPUT $elastic/$n -d '{ "sig": "$sig", "word": "$word" }');
print "debug: $cmd\n" if $n % 10000 == 0;
system( "$cmd > /dev/null 2>&1" );
insertws( $word, $sig, $n );
}
close( $in );
print "$n distinct words\n";
#!/bin/sh -
# Query a series of Elasticsearch _cat endpoints on one host, in order:
# nodes, master, indices, aliases, allocation.
host=http://146.169.44.185:9200
for endpoint in 'nodes?h=ip,port,heapPercent,name' 'master?v' indices aliases 'allocation?v'
do
	curl -XGET "$host/_cat/$endpoint"
done
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment