Source code for scrape.wood_database

#! /usr/bin/env python
'''
contains the WoodDatabaseScraper class used for scraping wood-database.com
'''
from __future__ import division, with_statement, print_function
from itertools import chain
import os
import re
import urllib
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import DataFrame, Series
# ------------------------------------------------------------------------------

class WoodDatabaseScraper(object):
    def __init__(self):
        '''
        scrapes wood-database.com for images and image descriptions
        '''
        self._url = 'http://www.wood-database.com'

    def get_wood_urls(self):
        '''
        yields links of each different wood page

        Yields:
            str: url
        '''
        response = requests.get(self._url).content
        soup = BeautifulSoup(response, 'html.parser')
        wrappers = soup.select('.fusion-column-wrapper')
        anchors = list(chain(*[w.select('a') for w in wrappers]))
        # the slice bounds come from the site's layout and skip links that
        # do not point at individual wood pages
        anchors = anchors[27:604]
        for item in anchors:
            try:
                yield item.attrs['href']
            except KeyError:
                pass
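
    # NOTE: scrape_images (below) and the write_images docstring both refer
    # to a get_image_links method that is absent from this listing. The
    # following is a hypothetical sketch, not the original implementation:
    # it assumes the texture images are plain <img> tags inside each wood
    # page's .post-content block and that their src attributes are absolute
    # urls.
    def get_image_links(self, urls):
        '''
        yields links of texture images found on each wood page

        Args:
            urls (iterable): links generated by get_wood_urls

        Yields:
            str: image url
        '''
        for url in urls:
            response = requests.get(url).content
            soup = BeautifulSoup(response, 'html.parser')
            for img in soup.select('.post-content img'):
                try:
                    yield img.attrs['src']
                except KeyError:
                    pass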

    def write_images(self, root, links, prefix=None):
        '''
        writes all images to a given root directory

        Args:
            root (str): root directory
            links (list): list of image links provided by get_image_links
            prefix (str, optional): filename prefix
                default: None

        Returns:
            None: None
        '''
        for link in links:
            filename = link.split('/')[-1]
            if prefix:
                filename = prefix + filename
            fullpath = os.path.join(root, filename)
            # Python 2 API; on Python 3 this is urllib.request.urlretrieve
            urllib.urlretrieve(link, fullpath)

    def scrape_images(self, root, prefix=None):
        '''
        scrapes and saves all texture images from wood-database.com

        Args:
            root (str): directory to save images in
            prefix (str, optional): filename prefix
                default: None
        '''
        urls = self.get_wood_urls()
        links = self.get_image_links(urls)
        self.write_images(root, links, prefix=prefix)
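
    # A minimal usage sketch (the output directory and prefix below are
    # hypothetical; the directory must already exist before images are
    # written into it):
    #
    #     scraper = WoodDatabaseScraper()
    #     scraper.scrape_images('/tmp/wood_textures', prefix='wood-')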

    def _clean_description(self, element):
        '''
        cleans up aggregated descriptions
        '''
        data = [x.getText() for x in element]
        data = Series(data)
        data = data.apply(lambda x: re.sub('\n', ' ', x))
        # drop empty rows and rows that merely label sample photos
        mask = data.apply(
            lambda x: False if re.search(
                r'\(sanded|\(sealed|\(endgrain|\(curl|\(burl|^$', x
            ) else True
        )
        data = data[mask]

        def func(item):
            # split each row at its first colon into heading and content
            try:
                return list(re.search('(.*?):(.*)', item).groups())
            except AttributeError:
                return [item, None]

        data = data.apply(func).tolist()
        data = DataFrame(data, columns=['heading', 'content'])
        # keep rows that have content, but always keep the first row even
        # if it has none
        mask = data.content.apply(lambda x: pd.notnull(x))
        if mask.shape[0] > 0:
            mask.iloc[0] = True
        data = data[mask]
        return data
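
    # For illustration (hypothetical input text, not scraped data): given
    # elements whose text reads
    #
    #     'Color/Appearance: Heartwood color ranges from pink to brown.'
    #     '(sanded)'
    #
    # _clean_description drops the '(sanded)' photo label, then splits the
    # remaining row at its first colon into heading 'Color/Appearance' and
    # content ' Heartwood color ranges from pink to brown.'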

    def get_descriptions(self, urls):
        '''
        gets description data for each wood type

        Args:
            urls (iterable): links generated by get_wood_urls

        Returns:
            dict: dict of lists
        '''
        data = {}
        for url in urls:
            response = requests.get(url).content
            soup = BeautifulSoup(response, 'html.parser')
            desc1 = soup.select('.post-content table tbody tr td p')
            desc2 = soup.select('.post-content blockquote')
            desc1 = self._clean_description(desc1)
            desc2 = self._clean_description(desc2)
            datum = pd.concat([desc1, desc2], axis=0)
            datum = datum.apply(lambda x: x.to_dict(), axis=1).tolist()
            # the wood's name is the second-to-last component of its url
            name = re.split('/', url)[-2]
            data[name] = datum
        return data

# ------------------------------------------------------------------------------

__all__ = [
    'WoodDatabaseScraper',
]


def main():
    pass


if __name__ == '__main__':
    help(main)
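
# ------------------------------------------------------------------------------
# Example usage (a sketch; the output path below is hypothetical and not
# part of the original module):
#
#     import json
#
#     scraper = WoodDatabaseScraper()
#     urls = scraper.get_wood_urls()
#     descriptions = scraper.get_descriptions(urls)
#     with open('/tmp/wood_descriptions.json', 'w') as f:
#         json.dump(descriptions, f, indent=4)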