Downloading JRA-3Q efficiently using a python script

Question

The JRA-3Q dataset is available from DIAS. There is an online interface to download what you need file-by-file.

Other reanalyses like ERA5 and MERRA2 have access points that let you write a python script to get what you want (and often subset it on their servers using opendap or similar).

I haven't been able to find a way to download the JRA-3Q dataset with a script like that. Part of the difficulty is the file naming, which includes an obscure-looking string, e.g.:

https://data.diasjp.net/dl/storages/file/L0pSQTNRL0NsaW05MTIwL0RhaWx5L2FubF9pc2VudHJvcC9hbmxfaXNlbnRyb3BfYnZmMi5jbGltOTEyMC5kYXkwMTA3

Has anyone come up with a way to download quite a lot of data in an efficient way?

alex_danielssen · Accepted Answer · 2023-08-15 21:12:33Z

The URL strings are base64-encoded; the example given decodes to: /JRA3Q/Clim9120/Daily/anl_isentrop/anl_isentrop_bvf2.clim9120.day0107

With this information, you can pretty easily figure out, for a given dataset, which parts of the string you need to replace. Then, use the provided download script:

#!/usr/bin/env python
"""Download files from the DIAS data server after a CAS single sign-on login.

The script logs in through the CAS login form at auth.diasjp.net, keeps the
session cookies in a cookie jar, and then streams the requested file to a
directory tree below the current working directory.

It runs on both Python 2.6+ and Python 3: the imports below fall back to the
Python 2 module names when the Python 3 ones are not available.
"""
import subprocess
import sys
import os
import optparse
import netrc
import getpass

try:  # Python 3 module layout
    from urllib.request import build_opener, HTTPCookieProcessor
    from urllib.error import HTTPError
    from urllib.parse import urlencode, urljoin
    from http.cookiejar import MozillaCookieJar
    from html.parser import HTMLParser
except ImportError:  # Python 2 module layout
    from urllib2 import build_opener, HTTPCookieProcessor, HTTPError
    from urllib import urlencode
    from urlparse import urljoin
    from cookielib import MozillaCookieJar
    from HTMLParser import HTMLParser


class LoginError(Exception):
    """Raised when the CAS login page cannot be parsed or the login fails."""

    def __init__(self, e):
        Exception.__init__(self, e)


class CASLoginParser(HTMLParser):
    """Collect the form action and pre-filled input fields of a login page.

    After feeding HTML, ``action`` holds the ``action`` attribute of the last
    ``<form>`` that had one (or None), and ``data`` maps the ``name`` of every
    ``<input>`` that has both ``name`` and ``value`` to that value.
    """

    def __init__(self):
        # Explicit base-class call: on Python 2 HTMLParser is an old-style
        # class, so super() cannot be used there.
        HTMLParser.__init__(self)
        self.action = None
        self.data = {}

    def handle_starttag(self, tagname, attribute):
        tag = tagname.lower()
        if tag == 'form':
            attribute = dict(attribute)
            if 'action' in attribute:
                self.action = attribute['action']
        elif tag == 'input':
            attribute = dict(attribute)
            if 'name' in attribute and 'value' in attribute:
                self.data[attribute['name']] = attribute['value']


class DIASAccess():
    """Cookie-keeping HTTP client that logs in to DIAS via CAS on demand."""

    def __init__(self, username, password):
        self.__cas_url = 'https://auth.diasjp.net/cas/login?'
        self.__username = username
        self.__password = password
        self.__cj = MozillaCookieJar()
        self.__opener = build_opener(HTTPCookieProcessor(self.__cj))

    def open(self, url, data=None):
        """Open *url*, logging in first if the server redirects to CAS.

        *data*, when given, is sent as the POST body (on Python 3 it must be
        bytes). Returns the response object for *url*.

        Raises:
            LoginError: the redirect target is not a login form, or the
                credentials were rejected.
        """
        response = self.__opener.open(url, data)
        response_url = response.geturl()
        if response_url != url and response_url.startswith(self.__cas_url):
            # Redirected to the CAS login page.
            response = self.__login_cas(response)
            if data is not None:
                # The POST body is lost across the login redirect, so the
                # original request has to be sent again.
                response.close()
                response = self.__opener.open(url, data)
        return response

    def __login_cas(self, response):
        """Submit the login form found in *response* and return the reply."""
        page = response.read()
        if not isinstance(page, str):
            # Python 3 returns bytes; HTMLParser.feed() needs text.
            # NOTE(review): utf-8 is assumed for the login page.
            page = page.decode('utf-8', 'replace')
        parser = CASLoginParser()
        parser.feed(page)
        parser.close()
        if parser.action is None:
            raise LoginError('Not login page')
        action_url = urljoin(response.geturl(), parser.action)
        data = parser.data
        data['username'] = self.__username
        data['password'] = self.__password
        response.close()
        payload = urlencode(data)
        if not isinstance(payload, bytes):
            # Python 3 requires the POST body to be bytes.
            payload = payload.encode('ascii')
        response = self.__opener.open(action_url, payload)
        if response.geturl() == action_url:
            # Still on the login form: the credentials were not accepted.
            response.close()
            raise LoginError('Authorization fail')
        return response

    def dl(self, url, path, file, data=None):
        """Download *url* to ``'.' + path + file`` and report the outcome.

        *path* is used as given, so it should start and end with '/'; the
        directory is created if it does not exist. Prints ``<path><file> OK``
        on success and ``<path><file> NG`` when the server answers with an
        HTTP error.

        Returns the (already consumed and closed) response on success, or
        None on an HTTP error.
        """
        block_size = 8192
        try:
            response = self.__opener.open(url, data)
            try:
                if not os.path.exists('.' + path):
                    os.makedirs('.' + path)
                # The with-block guarantees the file is flushed and closed;
                # the original `f.close` without parentheses never ran.
                with open('.' + path + file, 'wb') as f:
                    while True:
                        buffer = response.read(block_size)
                        if not buffer:
                            break
                        f.write(buffer)
            finally:
                response.close()
            print(path + file + " OK")
            return response
        except HTTPError:
            print(path + file + " NG")
            return None


def main():
    """Read credentials, log in to DIAS and download the configured file."""
    host = 'data.diasjp.net'
    usage = '''usage: %prog [options]'''
    parser = optparse.OptionParser(usage=usage)
    parser.add_option('-n', '--netrc', default=None,
                      help='specify the netrc file', metavar='FILE')
    parser.add_option('-u', '--user', default=None,
                      help='specify the DIAS account name',
                      metavar='USERNAME')
    (options, args) = parser.parse_args()

    (login, password) = (None, None)
    try:
        auth = netrc.netrc(options.netrc).authenticators(host)
        if auth is not None:
            (login, account, password) = auth
    except (IOError):
        # No readable netrc file: fall back to prompting below.
        pass

    if options.user is not None:
        # An explicit user overrides netrc, so its password no longer applies.
        login = options.user
        password = None

    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    if login is None:
        login = read_line('Username: ')
    if password is None:
        password = getpass.getpass('Password: ')

    access = DIASAccess(login, password)
    try:
        # Opening the dataset file list triggers the CAS login and stores
        # the session cookies used by the download below.
        targeturl = 'https://data.diasjp.net/dl/storages/filelist/dataset:645'
        response = access.open(targeturl)
        response.close()
    except LoginError as e:
        print(str(e))
        sys.exit(1)

    access.dl('https://data.diasjp.net/dl/storages/downloadCmd/L0pSQTNRL0NsaW05MTIwL0RhaWx5L2FubF9pc2VudHJvcC9hbmxfaXNlbnRyb3BfYnZmMi5jbGltOTEyMC5kYXkwMTAx',
              '/JRA3Q/Clim9120/Daily/anl_isentrop/',
              'anl_isentrop_bvf2.clim9120.day0101')


if __name__ == '__main__':
    main()

and replace the parameters in access.dl appropriately.

Great work ! I was searching for something similar online. Can you actually download all the variables - zonal wind, meridional wind etc ? Do you have a Jupyter notebook on a github site possibly that I can run ? — gansub
– gansub, Commented Aug 16, 2023 at 1:57
If you can share me your github site I can suggest enhancements to this code by allowing for pressure level downloads, surface level parameters etc and we can do a joint release. — gansub
– gansub, Commented Aug 16, 2023 at 10:39

Stack Exchange Network

Downloading JRA-3Q efficiently using a python script

1 Answer 1

Hot Network Questions

Downloading JRA-3Q efficiently using a python script

1 Answer 1

Related

Hot Network Questions