Skip to content

DataSourceDownloader

DataSourceDownloader

Source code in ddataflow/downloader.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
class DataSourceDownloader:
    """
    Downloads data source samples into a local folder using the databricks CLI.
    """

    # Local folder the samples are downloaded into; assigned by download_all().
    _download_folder: str

    def download_all(
        self, data_sources: "DataSources", overwrite: bool = True, debug=False
    ):
        """
        Download the data sources locally for development offline
        Note: you need databricks-cli for this command to work

        Options:
            data_sources: provides download_folder, all_data_sources_names()
                and get_data_source(name)
            overwrite: will first clean the existing files
            debug: forwards --debug to the databricks CLI

        Raises:
            Exception: when overwrite is set and the download folder path does
                not contain ".ddataflow", or when a download fails.
            OSError: when the existing download folder cannot be removed.
        """
        import shutil

        self._download_folder = data_sources.download_folder
        if overwrite:
            # Safety net: refuse to delete anything outside a .ddataflow folder.
            if ".ddataflow" not in self._download_folder:
                raise Exception("Can only clean folders within .ddataflow")

            print("Deleting content from", self._download_folder)
            # Remove the folder without going through a shell: interpolating the
            # path into `rm -rf` breaks on whitespace and shell metacharacters
            # and can delete unrelated paths.
            if os.path.islink(self._download_folder) or os.path.isfile(
                self._download_folder
            ):
                os.remove(self._download_folder)
            elif os.path.isdir(self._download_folder):
                shutil.rmtree(self._download_folder)

        print("Starting to download the data-sources into your snapshot folder")

        for data_source_name in data_sources.all_data_sources_names():
            print(f"Starting download process for datasource: {data_source_name}")
            data_source = data_sources.get_data_source(data_source_name)
            self._download_data_source(data_source, debug)

        print("Download of all data-sources finished successfully!")

    def _download_data_source(self, data_source: "DataSource", debug=False):
        """
        Download the latest data snapshot to the local machine for developing locally

        Runs `databricks fs cp -r` from data_source.get_dbfs_sample_path() to
        data_source.get_local_path(), after making sure the download folder exists.

        Raises:
            Exception: when the databricks CLI is missing or exits with a
                non-zero status.
        """
        import shlex
        import subprocess

        os.makedirs(self._download_folder, exist_ok=True)

        # Build an argument list so the paths are passed verbatim to the CLI
        # instead of being re-parsed by a shell.
        args = ["databricks", "fs", "cp"]
        if debug:
            args.append("--debug")
        args += [
            "-r",
            str(data_source.get_dbfs_sample_path()),
            str(data_source.get_local_path()),
        ]

        # Shell-quoted rendering, only for logging and for the error message
        # so the user can paste it into a terminal.
        cmd = " ".join(shlex.quote(arg) for arg in args)

        logger.info(cmd)
        try:
            result = subprocess.run(args).returncode
        except FileNotFoundError:
            # The CLI is not installed / not on PATH; report it through the
            # same failure path as a non-zero exit status.
            logger.error("databricks executable not found; is databricks-cli installed?")
            result = 127

        if result != 0:
            raise Exception(
                f"""
            Databricks cli failed! See error message above.
            Also consider rerunning the download command in your terminal to see the results.
            {cmd}
            """
            )

download_all(data_sources, overwrite=True, debug=False)

Download the data sources locally for offline development. Note: you need databricks-cli installed for this command to work.

Options: `overwrite` — when true, the existing files are cleaned first.

Source code in ddataflow/downloader.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
def download_all(
    self, data_sources: "DataSources", overwrite: bool = True, debug=False
):
    """
    Download the data sources locally for development offline
    Note: you need databricks-cli for this command to work

    Options:
        data_sources: provides download_folder, all_data_sources_names()
            and get_data_source(name)
        overwrite: will first clean the existing files
        debug: forwarded to the per-data-source download

    Raises:
        Exception: when overwrite is set and the download folder path does
            not contain ".ddataflow".
        OSError: when the existing download folder cannot be removed.
    """
    import shutil

    self._download_folder = data_sources.download_folder
    if overwrite:
        # Safety net: refuse to delete anything outside a .ddataflow folder.
        if ".ddataflow" not in self._download_folder:
            raise Exception("Can only clean folders within .ddataflow")

        print("Deleting content from", self._download_folder)
        # Remove the folder without going through a shell: interpolating the
        # path into `rm -rf` breaks on whitespace and shell metacharacters
        # and can delete unrelated paths.
        if os.path.islink(self._download_folder) or os.path.isfile(
            self._download_folder
        ):
            os.remove(self._download_folder)
        elif os.path.isdir(self._download_folder):
            shutil.rmtree(self._download_folder)

    print("Starting to download the data-sources into your snapshot folder")

    for data_source_name in data_sources.all_data_sources_names():
        print(f"Starting download process for datasource: {data_source_name}")
        data_source = data_sources.get_data_source(data_source_name)
        self._download_data_source(data_source, debug)

    print("Download of all data-sources finished successfully!")