A general purpose R interface to Elasticsearch
- This client is being developed under v1.0 of Elasticsearch.
- It is early days for this client, so do help us by submitting bug reports and feature requests on the issue tracker.
To avoid potential conflicts with other R packages, this package adds es_
as a prefix to every function.
Install
Install dependencies
install.packages(c("rjson","plyr","httr"))
Install elastic
install.packages("devtools")
library(devtools)
install_github("ropensci/elastic")
library(elastic)
Install Elasticsearch (on OSX)
- Download the zip or tar file from Elasticsearch (see the Elasticsearch download page)
- Unzip it:
unzip
or untar
- Move it:
sudo mv /path/to/elasticsearch-1.1.1 /usr/local
(replace the version with your version)
- Navigate to /usr/local:
cd /usr/local
- Add shortcut:
sudo ln -s elasticsearch-1.1.1 elasticsearch
(replace the version with your version)
Start Elasticsearch
- Navigate to elasticsearch:
cd /usr/local/elasticsearch
- Start elasticsearch:
bin/elasticsearch
I created a little bash shortcut called es
that runs both of the above commands in one step.
The function es_connect
is used before doing anything else to set the connection details to your remote or local elasticsearch store. The details created by es_connect
are written to your options for the current session, and are used by elastic
functions.
es_connect()
es_search(index="twitter")
matches -> 6
score -> 1
$took
[1] 1
$timed_out
[1] FALSE
$`_shards`
$`_shards`$total
[1] 5
$`_shards`$successful
[1] 5
$`_shards`$failed
[1] 0
$hits
$hits$total
[1] 6
$hits$max_score
[1] 1
$hits$hits
$hits$hits[[1]]
$hits$hits[[1]]$`_index`
[1] "twitter"
es_search(index="twitter", type="tweet", sort="message")
matches -> 3
score -> NA
$took
[1] 2
$timed_out
[1] FALSE
$`_shards`
$`_shards`$total
[1] 5
$`_shards`$successful
[1] 5
$`_shards`$failed
[1] 0
$hits
$hits$total
[1] 3
$hits$max_score
NULL
$hits$hits
$hits$hits[[1]]
$hits$hits[[1]]$`_index`
[1] "twitter"
$hits$hits[[1]]$`_type`
[1] "tweet"
$hits$hits[[1]]$`_id`
[1] "3"
$hits$hits[[1]]$`_score`
NULL
$hits$hits[[1]]$`_source`
$hits$hits[[1]]$`_source`$user
[1] "jane"
$hits$hits[[1]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
...
Get document with id=1
es_get(index='twitter', type='tweet', id=1)
http://127.0.0.1:9200/?=
$ok
[1] TRUE
$status
[1] 200
$name
[1] "Simon Williams"
$version
$version$number
[1] "0.90.11"
$version$build_hash
[1] "11da1bacf39cec400fd97581668acb2c5450516c"
$version$build_timestamp
[1] "2014-02-03T15:27:39Z"
$version$build_snapshot
[1] FALSE
$version$lucene_version
[1] "4.6"
$tagline
[1] "You Know, for Search"
attr(,"class")
[1] "elastic"
Get certain fields
es_get(index='twitter', type='tweet', id=1, fields='user')
http://127.0.0.1:9200/?fields=user
$ok
[1] TRUE
$status
[1] 200
$name
[1] "Simon Williams"
$version
$version$number
[1] "0.90.11"
$version$build_hash
[1] "11da1bacf39cec400fd97581668acb2c5450516c"
$version$build_timestamp
[1] "2014-02-03T15:27:39Z"
$version$build_snapshot
[1] FALSE
$version$lucene_version
[1] "4.6"
$tagline
[1] "You Know, for Search"
attr(,"class")
[1] "elastic"
Test for existence of the document
es_get(index='twitter', type='tweet', id=1, exists=TRUE)
200 - OK
Same index and type, different document ids
es_mget(index="twitter", type="tweet", id=1:2)
$docs
$docs[[1]]
$docs[[1]]$`_index`
[1] "twitter"
$docs[[1]]$`_type`
[1] "tweet"
$docs[[1]]$`_id`
[1] "1"
$docs[[1]]$`_version`
[1] 1
$docs[[1]]$exists
[1] TRUE
$docs[[1]]$`_source`
$docs[[1]]$`_source`$user
[1] "kimchy"
$docs[[1]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
$docs[[1]]$`_source`$message
[1] "trying out Elasticsearch"
$docs[[2]]
$docs[[2]]$`_index`
[1] "twitter"
$docs[[2]]$`_type`
[1] "tweet"
$docs[[2]]$`_id`
[1] "2"
$docs[[2]]$`_version`
[1] 1
$docs[[2]]$exists
[1] TRUE
$docs[[2]]$`_source`
$docs[[2]]$`_source`$user
[1] "scott"
$docs[[2]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
$docs[[2]]$`_source`$message
[1] "what shit what what"
Different indices, types, and ids
es_mget(index_type_id=list(c("twitter","mention",1), c("appdotnet","share",1)))
$docs
$docs[[1]]
$docs[[1]]$`_index`
[1] "twitter"
$docs[[1]]$`_type`
[1] "mention"
$docs[[1]]$`_id`
[1] "1"
$docs[[1]]$`_version`
[1] 1
$docs[[1]]$exists
[1] TRUE
$docs[[1]]$`_source`
$docs[[1]]$`_source`$user
[1] "sam"
$docs[[1]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
$docs[[1]]$`_source`$message
[1] "lorum ipsum"
$docs[[2]]
$docs[[2]]$`_index`
[1] "appdotnet"
$docs[[2]]$`_type`
[1] "share"
$docs[[2]]$`_id`
[1] "1"
$docs[[2]]$`_version`
[1] 1
$docs[[2]]$exists
[1] TRUE
$docs[[2]]$`_source`
$docs[[2]]$`_source`$user
[1] "bob"
$docs[[2]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
$docs[[2]]$`_source`$message
[1] "hello world"
es_parse
is a general purpose parser function with extension methods es_parse.es_search
, es_parse.es_get
, and es_parse.es_mget
, for parsing es_search
, es_get
, and es_mget
function output, respectively. es_parse
is used internally within those three functions (es_search
, es_get
, es_mget
) to do parsing. You can optionally get back raw json
from es_search
, es_get
, and es_mget
by setting the parameter raw=TRUE
, and then parse it afterwards with es_parse
.
For example:
(out <- es_mget(index="twitter", type="tweet", id=1:2, raw=TRUE))
[1] "{\"docs\":[{\"_index\":\"twitter\",\"_type\":\"tweet\",\"_id\":\"1\",\"error\":\"NoShardAvailableActionException[[twitter][2] null]\"},{\"_index\":\"twitter\",\"_type\":\"tweet\",\"_id\":\"2\",\"error\":\"NoShardAvailableActionException[[twitter][3] null]\"}]}"
attr(,"class")
[1] "elastic_mget"
Then parse
es_parse(out)
$docs
$docs[[1]]
$docs[[1]]$`_index`
[1] "twitter"
$docs[[1]]$`_type`
[1] "tweet"
$docs[[1]]$`_id`
[1] "1"
$docs[[1]]$`_version`
[1] 1
$docs[[1]]$exists
[1] TRUE
$docs[[1]]$`_source`
$docs[[1]]$`_source`$user
[1] "kimchy"
$docs[[1]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
$docs[[1]]$`_source`$message
[1] "trying out Elasticsearch"
$docs[[2]]
$docs[[2]]$`_index`
[1] "twitter"
$docs[[2]]$`_type`
[1] "tweet"
$docs[[2]]$`_id`
[1] "2"
$docs[[2]]$`_version`
[1] 1
$docs[[2]]$exists
[1] TRUE
$docs[[2]]$`_source`
$docs[[2]]$`_source`$user
[1] "scott"
$docs[[2]]$`_source`$post_date
[1] "2009-11-15T14:12:12"
$docs[[2]]$`_source`$message
[1] "what shit what what"
- Navigate to the elasticsearch dir:
cd elasticsearch
- Install it:
bin/plugin -install elasticsearch/elasticsearch-river-couchdb/2.0.0.RC1
- Navigate to elasticsearch:
cd /usr/local/elasticsearch
- Start elasticsearch:
bin/elasticsearch -f
Edit details and paste into terminal and execute
curl -XPUT 'localhost:9200/_river/rplos_db/_meta' -d '{ "type" : "couchdb", "couchdb" : { "host" : "localhost", "port" : 5984, "db" : "rplos_db", "filter" : null } }'
es_connect(es_base="localhost", es_port=9200)
# your stringified JSON query - recommend developing the queries with the mobz/head plugin. Then just paste the valid json into an R string object
sample_json <- '{
"size":100,
"query": {
"match_all": {}
}
}'
data <- es_search_v2(index="activity_instance", type="jdbc", data=sample_json)
curl -XGET "http://localhost:9200/sofadb/_search?q=road&pretty=true"
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 0.614891,
"hits" : [ {
"_index" : "sofadb",
"_type" : "sofadb",
"_id" : "a1812100bd1dba00c2ed1cd507000277",
"_score" : 0.614891, "_source" : {"_rev":"1-5406480672da172726810767e7d0ead3","_id":"a1812100bd1dba00c2ed1cd507000277","name":"sofa","icecream":"rocky road"}
}, {
"_index" : "sofadb",
"_type" : "sofadb",
"_id" : "a1812100bd1dba00c2ed1cd507000b92",
"_score" : 0.13424811, "_source" : {"_rev":"1-5406480672da172726810767e7d0ead3","_id":"a1812100bd1dba00c2ed1cd507000b92","name":"sofa","icecream":"rocky road"}
} ]
}
}