A very simple approach would be to find some kind of centroid for each cluster (e.g. by averaging the topic distributions of the documents belonging to that cluster) and then calculate the cosine distance of each document within the cluster from the corresponding centroid. The document with the shortest distance will be the closest to the centroid, hence the most "representative".
Continuing from the previous example:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Cluster a handful of documents (each described by a topic distribution)
# with DBSCAN, then pick one "representative" document per cluster: the
# member whose cosine distance to the cluster centroid is smallest.

# Initialize some documents
doc1 = {'Science': 0.8, 'History': 0.05, 'Politics': 0.15, 'Sports': 0.1}
doc2 = {'News': 0.2, 'Art': 0.8, 'Politics': 0.1, 'Sports': 0.1}
doc3 = {'Science': 0.8, 'History': 0.1, 'Politics': 0.05, 'News': 0.1}
doc4 = {'Science': 0.1, 'Weather': 0.2, 'Art': 0.7, 'Sports': 0.1}
collection = [doc1, doc2, doc3, doc4]
df = pd.DataFrame(collection)

# Fill missing values with zeros (a topic absent from a document has weight 0)
df.fillna(0, inplace=True)

# Get feature vectors: one row per document, one column per topic.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
feature_matrix = df.to_numpy()

# Fit DBSCAN on the precomputed pairwise cosine-distance matrix
db = DBSCAN(min_samples=1, metric='precomputed').fit(
    pairwise_distances(feature_matrix, metric='cosine'))
labels = db.labels_

# DBSCAN marks noise samples with the label -1; noise is not a cluster
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

# Find the representatives
representatives = {}
for label in np.unique(labels):
    # Skip noise so the result agrees with the cluster count above
    if label == -1:
        continue

    # Find indices of documents belonging to the same cluster
    ind = np.flatnonzero(labels == label)

    # Select these specific documents
    cluster_samples = feature_matrix[ind, :]

    # Calculate their centroid as an (unweighted) average
    centroid = np.mean(cluster_samples, axis=0)

    # Find the distance of each document from the centroid
    distances = [cosine(sample_doc, centroid) for sample_doc in cluster_samples]

    # Keep the document closest to the centroid as the representative;
    # argmin avoids sorting every distance just to take the first one
    representatives[label] = cluster_samples[np.argmin(distances), :]

# dict.iteritems() only exists in Python 2; items() works in Python 2 and 3
for label, doc in representatives.items():
    print("Label : %d -- Representative : %s" % (label, str(doc)))