Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
- import os
- import time
- from bs4 import BeautifulSoup, SoupStrainer
- from site_base import Site
- import client_blogger
class Blogger(Site):
    """Parser for a Blogger blog.

    Fetches the posts of a blog through ``client_blogger.BloggerClient``
    and extracts image links from the posts' HTML content.
    """

    # Site-type identifier; also used as the directory name in ``type_dir``.
    type = 'blogger'
    # Computed once, when the class body runs, from the working directory
    # at that moment.
    type_dir = os.path.join(os.getcwd(), type)
    # API key handed to BloggerClient (placeholder value).
    key = 'MY_API_KEY_HERE'

    def __init__(self, user):
        """Set up the directory path and API client for one blog.

        Args:
            user: Forwarded to ``Site.__init__`` and also stored as the
                blog URL that is later passed to the client's
                ``get_blog_id``.
        """
        super(Blogger, self).__init__(user)
        # NOTE(review): self.user is presumably set by Site.__init__ --
        # confirm against site_base.
        self.user_dir = os.path.join(Blogger.type_dir, self.user)
        self.blog_url = user
        self.client = client_blogger.BloggerClient(key=Blogger.key)

    def get_all_posts(self):
        """Retrieve the content of every post of the blog.

        Pages are requested one after another for as long as the response
        carries a ``nextPageToken``.

        Returns:
            A single string: the ``content`` field of every post, joined
            with single spaces. Empty string when the blog has no posts.
        """
        blog_id = self.client.get_blog_id(self.blog_url)

        # First page of posts.
        page_of_posts = self.client.get_posts(blog_id)
        # A page is not guaranteed to carry an 'items' key (e.g. a blog
        # without posts), so never index it directly.
        text_content = [
            post['content'] for post in page_of_posts.get('items', [])
        ]

        while not self._reached_last_page(page_of_posts):
            token = page_of_posts.get('nextPageToken')
            # Pause between consecutive page requests.
            time.sleep(1)
            page_of_posts = self.client.get_posts(blog_id, pageToken=token)
            text_content.extend(
                post['content'] for post in page_of_posts.get('items', [])
            )

        return ' '.join(text_content)

    def _reached_last_page(self, page_of_posts):
        """Return True when the page has no ``nextPageToken`` key."""
        return 'nextPageToken' not in page_of_posts

    def get_media_links(self, all_content):
        """Extract image links from HTML content.

        Args:
            all_content: String of HTML, e.g. the value returned by
                ``get_all_posts``.

        Returns:
            A list with the non-empty ``src`` attribute of every ``<img>``
            tag, in document order. Duplicates are kept and no filtering
            by file extension is applied.
        """
        just_images = SoupStrainer('img')
        # Name the parser explicitly so the result does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(
            all_content, 'html.parser', parse_only=just_images
        )
        # find_all yields only <img> tags, so non-tag nodes at the top of
        # the soup can never reach the attribute lookup.
        return [img['src'] for img in soup.find_all('img') if img.get('src')]
- if __name__ == '__main__':
- pass  # placeholder: no script behavior is defined for direct execution
Advertisement
Add Comment
Please, Sign In to add comment