"""Downloading functions."""# Created by Wenjie Du <wenjay.du@gmail.com># License: BSD-3-ClauseimportgzipimportosimportshutilimporttempfileimportwarningsfromtypingimportOptionalimportrequestsfromtqdmimporttqdmfrom.loggingimportloggerfrom..databaseimportDATABASEdef_download_and_extract(url:str,saving_path:str)->Optional[str]:"""Download dataset from the given url and extract to the given saving path. Parameters ---------- url : str, URL of the dataset to be downloaded. saving_path : str, Path to save extracted dataset. Returns ------- saving_path if successful else None """no_need_decompression_format=["csv","txt"]supported_compression_format=["zip","tar","gz","bz","xz"]# truncate the file name from urlfile_name=os.path.basename(url)suffix=file_name.split(".")[-1]ifsuffixinno_need_decompression_format:raw_data_saving_path=os.path.join(saving_path,file_name)elifsuffixinsupported_compression_format:# create temp dir for raw data savingtmp_dir=tempfile.mkdtemp()raw_data_saving_path=os.path.join(tmp_dir,file_name)else:warnings.warn("The compression format is not supported, aborting. ""If necessary, please create a pull request to add according supports.",category=RuntimeWarning,)returnNone# download and save the raw datasettry:withrequests.get(url,stream=True)asr:r.raise_for_status()chunk_size=8192try:size=int(r.headers["Content-Length"])exceptKeyError:size=Nonewithtqdm(unit="B",unit_scale=True,unit_divisor=1024,miniters=1,desc=f"Downloading {file_name}",total=size,)aspbar:withopen(raw_data_saving_path,"wb")asf:forchunkinr.iter_content(chunk_size=chunk_size):f.write(chunk)pbar.update(len(chunk))exceptExceptionase:shutil.rmtree(saving_path,ignore_errors=True)shutil.rmtree(raw_data_saving_path,ignore_errors=True)raiseRuntimeError(f"Exception: {e}\n"f"Download failed. 
Aborting.")exceptKeyboardInterrupt:shutil.rmtree(saving_path,ignore_errors=True)shutil.rmtree(raw_data_saving_path,ignore_errors=True)raiseKeyboardInterrupt("Download cancelled by the user.")logger.info(f"Successfully downloaded data to {raw_data_saving_path}")# if the file is compressed, then unpack itifsuffixinsupported_compression_format:try:os.makedirs(saving_path,exist_ok=True)if".txt.gz"infile_name:new_name=file_name.split(".txt.gz")[0]new_name=new_name+".txt"saving_path=os.path.join(saving_path,new_name)withopen(raw_data_saving_path,"rb")asf,open(saving_path,"wb")aswf:wf.write(gzip.decompress(f.read()))else:shutil.unpack_archive(raw_data_saving_path,saving_path)logger.info(f"Successfully extracted data to {saving_path}")exceptExceptionase:shutil.rmtree(saving_path,ignore_errors=True)raiseRuntimeError(f"❌ {e}")finally:shutil.rmtree(tmp_dir,ignore_errors=True)returnsaving_path
def download_and_extract(dataset_name: str, dataset_saving_path: str) -> None:
    """Wrapper of _download_and_extract.

    Looks up the download link(s) registered for ``dataset_name`` in
    ``DATABASE``, creates ``dataset_saving_path`` and downloads every link
    into it. Datasets whose link starts with ``hf://`` are not downloaded
    here; only the directory is created and a message is logged.

    Parameters
    ----------
    dataset_name : str,
        The name of a dataset available in tsdb.

    dataset_saving_path : str,
        The local path for dataset saving.

    Raises
    ------
    FileExistsError
        If ``dataset_saving_path`` already exists, because the directory is
        created without ``exist_ok``.

    Notes
    -----
    Any error raised by the ``DATABASE`` lookup for an unregistered name
    (presumably ``KeyError``) propagates before anything is created on disk.

    """
    logger.info("Start downloading...")

    # Resolve the link(s) before touching the file system, so that an unknown
    # dataset name does not leave an empty directory behind.
    dataset_url = DATABASE[dataset_name]
    os.makedirs(dataset_saving_path)

    if isinstance(dataset_url, str) and dataset_url.startswith("hf://"):
        # HuggingFace datasets are downloaded during loading via the datasets library
        logger.info(
            f"Dataset {dataset_name} is hosted on HuggingFace. "
            f"It will be downloaded on the first call to the dataset loader."
        )
        return

    # a dataset may be registered with a single link or with a list of links
    if isinstance(dataset_url, list):
        for link in dataset_url:
            _download_and_extract(link, dataset_saving_path)
    else:
        _download_and_extract(dataset_url, dataset_saving_path)