Commit 3e26277

Example code using the client

1 parent 6947b01 commit 3e26277

3 files changed: 339 additions, 0 deletions

example/README.rst

Lines changed: 25 additions & 0 deletions

Example code for `elasticsearch-py`
===================================

This example code demonstrates the features and usage patterns of the Python client.

To run this example, make sure you have Elasticsearch running on port 9200 and
install the additional dependencies (on top of `elasticsearch-py`)::

    pip install python-dateutil GitPython

Now you can load the index (the index will be called `git`)::

    python load.py

This will create an index with mappings, parse the git information of this
repository, and load all the commits into it. You can then run some sample
queries::

    python queries.py

Look at the `queries.py` file for query examples and at `load.py` for examples
of loading data into Elasticsearch. Both `load` and `queries` set up logging,
so `/tmp/es_trace.log` will contain a transcript of the commands being run, in
curl format.

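Before running `load.py`, it may help to verify that the client can actually reach the node. A minimal sketch (not part of this commit; it only assumes the default local node on port 9200)::

    from elasticsearch import Elasticsearch

    es = Elasticsearch()  # connects to localhost:9200 by default
    # info() performs a GET / against the cluster and raises
    # elasticsearch.ConnectionError if no node is reachable
    print(es.info())
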
example/load.py

Lines changed: 216 additions & 0 deletions

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

from os.path import dirname, basename, abspath
from itertools import chain
from datetime import datetime
import logging

import git

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, streaming_bulk

def create_git_index(client, index):
    # create empty index
    client.indices.create(
        index=index,
        body={
            'settings': {
                # just one shard, no replicas for testing
                'number_of_shards': 1,
                'number_of_replicas': 0,

                # custom analyzer for analyzing file paths
                'analysis': {
                    'analyzer': {
                        'file_path': {
                            'type': 'custom',
                            'tokenizer': 'path_hierarchy',
                            'filter': ['lowercase']
                        }
                    }
                }
            }
        },
        # ignore an already existing index
        ignore=400
    )

    # we will use the user mapping in several places
    user_mapping = {
        'properties': {
            'name': {
                'type': 'multi_field',
                'fields': {
                    'raw': {'type': 'string', 'index': 'not_analyzed'},
                    'name': {'type': 'string'}
                }
            }
        }
    }

    client.indices.put_mapping(
        index=index,
        doc_type='repos',
        body={
            'repos': {
                'properties': {
                    'owner': user_mapping,
                    'created_at': {'type': 'date'},
                    'description': {
                        'type': 'string',
                        'analyzer': 'snowball',
                    },
                    'tags': {
                        'type': 'string',
                        'index': 'not_analyzed'
                    }
                }
            }
        }
    )

    client.indices.put_mapping(
        index=index,
        doc_type='commits',
        body={
            'commits': {
                '_parent': {
                    'type': 'repos'
                },
                'properties': {
                    'author': user_mapping,
                    'authored_date': {'type': 'date'},
                    'committer': user_mapping,
                    'committed_date': {'type': 'date'},
                    'parent_shas': {'type': 'string', 'index': 'not_analyzed'},
                    'description': {'type': 'string', 'analyzer': 'snowball'},
                    'files': {'type': 'string', 'analyzer': 'file_path'}
                }
            }
        }
    )

def parse_commits(repo, name):
    """
    Go through the git repository log and generate a document per commit
    containing all the metadata.
    """
    for commit in repo.log():
        yield {
            '_id': commit.id,
            '_parent': name,
            'committed_date': datetime(*commit.committed_date[:6]),
            'committer': {
                'name': commit.committer.name,
                'email': commit.committer.email,
            },
            'authored_date': datetime(*commit.authored_date[:6]),
            'author': {
                'name': commit.author.name,
                'email': commit.author.email,
            },
            'description': commit.message,
            'parent_shas': [p.id for p in commit.parents],
            # we only care about the filenames, not the per-file stats
            'files': list(chain(commit.stats.files)),
            'stats': commit.stats.total,
        }

def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all of its commits and load it into
    elasticsearch using `client`. If the index doesn't exist it will be
    created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # create the parent document in case it doesn't exist
    client.create(
        index=index,
        doc_type='repos',
        id=repo_name,
        body={},
        ignore=409  # 409 - conflict - would be returned if the document is already there
    )

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo, repo_name),
            index=index,
            doc_type='commits',
            chunk_size=50  # keep the batch sizes small for appearances only
        ):
        action, result = result.popitem()
        doc_id = '/%s/commits/%s' % (index, result['_id'])
        # process the information from ES on whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)


# we manually create the es repo document and update elasticsearch-py to include metadata
REPO_ACTIONS = [
    {'_type': 'repos', '_id': 'elasticsearch', '_source': {
        'owner': {'name': 'Shay Banon', 'email': 'kimchy@gmail.com'},
        'created_at': datetime(2010, 2, 8, 15, 22, 27),
        'tags': ['search', 'distributed', 'lucene'],
        'description': 'You know, for search.'}
    },

    {'_type': 'repos', '_id': 'elasticsearch-py', '_op_type': 'update', 'doc': {
        'owner': {'name': 'Honza Král', 'email': 'honza.kral@gmail.com'},
        'created_at': datetime(2013, 5, 1, 16, 37, 32),
        'tags': ['elasticsearch', 'search', 'python', 'client'],
        'description': 'For searching snakes.'}
    },
]

if __name__ == '__main__':
    # get trace logger and set level
    tracer = logging.getLogger('elasticsearch.trace')
    tracer.setLevel(logging.INFO)
    tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))

    # instantiate es client, connects to localhost:9200 by default
    es = Elasticsearch()

    # we load the repo and all commits
    load_repo(es)

    # run the bulk operations
    success, _ = bulk(es, REPO_ACTIONS, index='git', raise_on_error=True)
    print('Performed %d actions' % success)

    # now we can retrieve the documents
    es_repo = es.get(index='git', doc_type='repos', id='elasticsearch')
    print('%s: %s' % (es_repo['_id'], es_repo['_source']['description']))

    # update - add java to es tags
    es.update(
        index='git',
        doc_type='repos',
        id='elasticsearch',
        body={
            "script": "ctx._source.tags += tag",
            "params": {
                "tag": "java"
            }
        }
    )

    # refresh to make the documents available for search
    es.indices.refresh(index='git')

    # and now we can count the documents
    print(es.count(index='git')['count'], 'documents in index')

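Since `load_repo` accepts optional `path` and `index` arguments, the same loader can be pointed at any local clone. A minimal sketch, where the repository path is a hypothetical example::

    from elasticsearch import Elasticsearch
    from load import load_repo

    es = Elasticsearch()
    # index the commits of another local clone; '/tmp/some-repo' is
    # a made-up path used only for illustration
    load_repo(es, path='/tmp/some-repo', index='git')
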
example/queries.py

Lines changed: 98 additions & 0 deletions

#!/usr/bin/env python
from __future__ import print_function

import logging
from dateutil.parser import parse as parse_date

from elasticsearch import Elasticsearch

def print_hits(results, facet_masks={}):
    " Simple utility function to print results of a search query. "
    print('=' * 80)
    print('Total %d found in %dms' % (results['hits']['total'], results['took']))
    if results['hits']['hits']:
        print('-' * 80)
    for hit in results['hits']['hits']:
        # get the created date for a repo and fall back to authored_date for a commit
        created_at = parse_date(hit['_source'].get('created_at', hit['_source']['authored_date']))
        print('/%s/%s/%s (%s): %s' % (
            hit['_index'], hit['_type'], hit['_id'],
            created_at.strftime('%Y-%m-%d'),
            hit['_source']['description'].replace('\n', ' ')))

    for facet, mask in facet_masks.items():
        print('-' * 80)
        for d in results['facets'][facet]['terms']:
            print(mask % d)
    print('=' * 80)
    print()

# get trace logger and set level
tracer = logging.getLogger('elasticsearch.trace')
tracer.setLevel(logging.INFO)
tracer.addHandler(logging.FileHandler('/tmp/es_trace.log'))
# instantiate es client, connects to localhost:9200 by default
es = Elasticsearch()

print('Empty search:')
print_hits(es.search(index='git'))

print('Last 8 Commits for elasticsearch-py:')
result = es.search(
    index='git',
    doc_type='commits',
    body={
        'query': {
            'filtered': {
                'filter': {
                    'term': {
                        # the parent ref is stored as type#id
                        '_parent': 'repos#elasticsearch-py'
                    }
                }
            }
        },
        'sort': [
            {'committed_date': {'order': 'desc'}}
        ],
        'size': 8
    }
)
print_hits(result)

print('Stats for top 10 python committers:')
result = es.search(
    index='git',
    doc_type='commits',
    body={
        'size': 0,
        'query': {
            'filtered': {
                'filter': {
                    'has_parent': {
                        'type': 'repos',
                        'query': {
                            'filtered': {
                                'filter': {
                                    'term': {
                                        'tags': 'python'
                                    }
                                }
                            }
                        }
                    }
                }
            }
        },
        'facets': {
            'committers': {
                'terms_stats': {
                    'key_field': 'committer.name.raw',
                    'value_field': 'stats.lines'
                }
            }
        }
    }
)
print_hits(result, {'committers': '%(term)15s: %(count)3d commits changing %(total)6d lines'})

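Each facet bucket `d` that `print_hits` feeds into a mask is a dict carrying `term`, `count` and `total` keys, so plain `%`-formatting with named fields is all that is needed. A small self-contained illustration (the sample values are invented)::

    # the same mask used in the last query above
    mask = '%(term)15s: %(count)3d commits changing %(total)6d lines'
    # a made-up facet bucket for demonstration
    d = {'term': 'Jane Doe', 'count': 12, 'total': 340}
    print(mask % d)
    # ->        Jane Doe:  12 commits changing    340 lines
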