Guest User

Untitled

a guest
Sep 20th, 2014
400
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.10 KB | None | 0 0
  1. import os
  2. import time
  3. from bs4 import BeautifulSoup, SoupStrainer
  4.  
  5. from site_base import Site
  6. import client_blogger
  7.  
  8.  
  9. class Blogger(Site):
  10.     """ A blogger parser. """
  11.     type = 'blogger'
  12.     type_dir = os.path.join(os.getcwd(), type)
  13.     key = 'MY_API_KEY_HERE'
  14.    
  15.     def __init__(self, user):
  16.         super(Blogger, self).__init__(user)
  17.         self.user_dir = os.path.join(Blogger.type_dir, self.user)
  18.         self.blog_url = user
  19.         self.client = client_blogger.BloggerClient(key=Blogger.key)
  20.        
  21.     def get_all_posts(self):
  22.         """ Retrieves all posts from a blog. Returns list of all text content. """
  23.         blog_id = self.client.get_blog_id(self.blog_url)
  24.         # Get first page of posts:
  25.         page_of_posts = self.client.get_posts(blog_id)
  26.         text_content = [post['content'] for post in page_of_posts['items']]
  27.        
  28.         while not self._reached_last_page(page_of_posts):
  29.             token = page_of_posts.get('nextPageToken')
  30.             # Get next page of posts:
  31.             time.sleep(1)
  32.             page_of_posts = self.client.get_posts(blog_id, pageToken=token)
  33.             # Get text content from page of posts:
  34.             if 'items' in page_of_posts:        # sometimes no items? Ask google.
  35.                 for post in page_of_posts['items']:
  36.                     text_content.append(post['content'])
  37.         all_content = ' '.join(text_content)
  38.         return all_content
  39.    
  40.     def _reached_last_page(self, page_of_posts):
  41.         """ Returns true if there are no more pages of posts left. """
  42.         return 'nextPageToken' not in page_of_posts
  43.        
  44.     def get_media_links(self, all_content):
  45.         """ Accepts string of text content. Returns a set of all media links. """
  46.         # allowed = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
  47.         just_images = SoupStrainer('img')
  48.         soup = BeautifulSoup(all_content, parse_only=just_images)
  49.         links = [img.get('src') for img in soup if img.get('src', None)] # if img.get('src').lower().endswith(allowed)]
  50.         return links
  51.        
  52.        
  53. if __name__ == '__main__':
  54.     pass  # placeholder: running this module as a script does nothing
Advertisement
Add Comment
Please, Sign In to add comment