#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

from os.path import dirname, basename, abspath
from datetime import datetime
import logging

import git

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, streaming_bulk
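
# index a git repository into Elasticsearch: one parent document per repo
# and one child document per commit, with a custom path_hierarchy analyzer
# so file paths can be searched by directory prefix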

def create_git_index(client, index):
    # create empty index
    client.indices.create(
        index=index,
        body={
            'settings': {
                # just one shard, no replicas for testing
                'number_of_shards': 1,
                'number_of_replicas': 0,

                # custom analyzer for analyzing file paths
                'analysis': {
                    'analyzer': {
                        'file_path': {
                            'type': 'custom',
                            'tokenizer': 'path_hierarchy',
                            'filter': ['lowercase']
                        }
                    }
                }
            }
        },
        # ignore already existing index
        ignore=400
    )
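
    # the path_hierarchy tokenizer used above emits every prefix of a path,
    # so e.g. 'src/main/app.py' is indexed as 'src', 'src/main' and
    # 'src/main/app.py' (lowercased by the filter); a term query for a
    # directory therefore matches all files beneath it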

    # the same user mapping will be used in several places (repo owner,
    # commit author and committer)
    user_mapping = {
        'properties': {
            'name': {
                'type': 'multi_field',
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                    'name': {'type': 'string'}
                }
            }
        }
    }
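
    # 'name' is indexed twice by the multi_field: analyzed as 'name' for
    # full text search and verbatim as 'name.raw' for sorting, faceting
    # and exact matches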

    client.indices.put_mapping(
        index=index,
        doc_type='repos',
        body={
            'repos': {
                'properties': {
                    'owner': user_mapping,
                    'created_at': {'type': 'date'},
                    'description': {
                        'type': 'string',
                        'analyzer': 'snowball',
                    },
                    'tags': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }
    )

    client.indices.put_mapping(
        index=index,
        doc_type='commits',
        body={
            'commits': {
                '_parent': {
                    'type': 'repos'
                },
                'properties': {
                    'author': user_mapping,
                    'authored_date': {'type': 'date'},
                    'committer': user_mapping,
                    'committed_date': {'type': 'date'},
                    'parent_shas': {'type': 'string', 'index': 'not_analyzed'},
                    'description': {'type': 'string', 'analyzer': 'snowball'},
                    'files': {'type': 'string', 'analyzer': 'file_path'}
                }
            }
        }
    )
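
    # the '_parent' mapping above ties each commit to a repo document and
    # enables parent/child queries, e.g. a repo could be found by its
    # commit messages with (illustrative only):
    #   {'query': {'has_child': {'type': 'commits',
    #                            'query': {'match': {'description': 'fix'}}}}}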

def parse_commits(repo, name):
    """
    Go through the git repository log and generate a document per commit
    containing all the metadata.
    """
    # iterate over all commits reachable from HEAD (GitPython >= 0.3 API)
    for commit in repo.iter_commits():
        yield {
            '_id': commit.hexsha,
            '_parent': name,
            # commit timestamps are seconds since the epoch
            'committed_date': datetime.utcfromtimestamp(commit.committed_date),
            'committer': {
                'name': commit.committer.name,
                'email': commit.committer.email,
            },
            'authored_date': datetime.utcfromtimestamp(commit.authored_date),
            'author': {
                'name': commit.author.name,
                'email': commit.author.email,
            },
            'description': commit.message,
            'parent_shas': [p.hexsha for p in commit.parents],
            # we only care about the filenames, not the per-file stats
            'files': list(commit.stats.files),
            'stats': commit.stats.total,
        }
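
# note: keys starting with an underscore in the documents generated above
# ('_id', '_parent') are consumed by the bulk helpers as per-action metadata
# instead of being indexed as part of the document body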

def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    # default to the git repository this example file lives in
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # create the parent document in case it doesn't exist
    client.create(
        index=index,
        doc_type='repos',
        id=repo_name,
        body={},
        ignore=409  # 409 - conflict - would be returned if the document is already there
    )

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo, repo_name),
            index=index,
            doc_type='commits',
            chunk_size=50  # keep the batch sizes small for demonstration purposes
        ):
        action, result = result.popitem()
        doc_id = '/%s/commits/%s' % (index, result['_id'])
        # report whether the document has been successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)

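# load_repo can also be pointed at any local clone, e.g. (hypothetical path):
#   load_repo(es, path='/path/to/some/repo', index='git')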

# we manually create the repo documents: a full document for the
# elasticsearch repo, and a partial update adding metadata to the
# elasticsearch-py document created in load_repo
REPO_ACTIONS = [
    {'_type': 'repos', '_id': 'elasticsearch', '_source': {
        'owner': {'name': 'Shay Banon', 'email': 'kimchy@gmail.com'},
        'created_at': datetime(2010, 2, 8, 15, 22, 27),
        'tags': ['search', 'distributed', 'lucene'],
        'description': 'You know, for search.'}
    },

    {'_type': 'repos', '_id': 'elasticsearch-py', '_op_type': 'update', 'doc': {
        'owner': {'name': 'Honza Král', 'email': 'honza.kral@gmail.com'},
        'created_at': datetime(2013, 5, 1, 16, 37, 32),
        'tags': ['elasticsearch', 'search', 'python', 'client'],
        'description': 'For searching snakes.'}
    },
]
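
# each entry above is in the action format understood by the bulk helpers:
# underscore-prefixed keys become action metadata, '_op_type' defaults to
# 'index', and for 'update' actions the partial document is passed as 'doc'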

if __name__ == '__main__':
    # get trace logger and set level
    tracer = logging.getLogger('elasticsearch.trace')
    tracer.setLevel(logging.INFO)
    tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))
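    # the trace logger logs every request as a curl command that can be
    # replayed against a cluster for debugging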

    # instantiate es client, connects to localhost:9200 by default
    es = Elasticsearch()

    # we load the repo and all commits
    load_repo(es)

    # run the bulk operations
    success, _ = bulk(es, REPO_ACTIONS, index='git', raise_on_error=True)
    print('Performed %d actions' % success)

    # now we can retrieve the documents
    es_repo = es.get(index='git', doc_type='repos', id='elasticsearch')
    print('%s: %s' % (es_repo['_id'], es_repo['_source']['description']))

    # update - add java to es tags
    es.update(
        index='git',
        doc_type='repos',
        id='elasticsearch',
        body={
            "script": "ctx._source.tags += tag",
            "params": {
                "tag": "java"
            }
        }
    )
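
    # depending on the Elasticsearch version, dynamic scripting may be
    # disabled by default; if the update fails, scripting needs to be
    # enabled in the cluster configuration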

    # refresh to make the documents available for search
    es.indices.refresh(index='git')

    # and now we can count the documents
    print(es.count(index='git')['count'], 'documents in index')
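
    # extra illustration of the 'file_path' analyzer ('test/unit' is an
    # arbitrary example directory): path_hierarchy indexes every prefix of
    # a path, so a term query for a directory matches every commit touching
    # a file beneath it
    result = es.search(index='git', doc_type='commits',
        body={'query': {'term': {'files': 'test/unit'}}})
    print('%d commits touching test/unit' % result['hits']['total'])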