Source code for now.data_loading.data_loading

import json
import os
from collections import defaultdict
from typing import Dict, List, Type

from docarray import Document, DocumentArray
from docarray.dataclasses import is_multimodal

from now.common.detect_schema import (
    get_first_file_in_folder_structure_s3,
    get_s3_bucket_and_folder_prefix,
)
from now.constants import MAX_DOCS_FOR_TESTING, DatasetTypes
from now.data_loading.create_dataclass import create_dataclass
from now.data_loading.elasticsearch import ElasticsearchExtractor
from now.log import yaspin_extended
from now.now_dataclasses import UserInput
from now.utils.common.helpers import flatten_dict, sigmap
from now.utils.docarray.helpers import get_chunk_by_field_name


def load_data(user_input: UserInput, print_callback=print) -> DocumentArray:
    """Based on the user input, this function will pull the configured DocumentArray dataset
    ready for the preprocessing executor.

    :param user_input: The configured user object. Result from the Jina Now cli dialog.
    :param print_callback: The callback function that should be used to print the status.
    :return: The loaded DocumentArray.
    """
    da = None
    if user_input.dataset_type in [DatasetTypes.DEMO, DatasetTypes.DOCARRAY]:
        user_input.field_names_to_dataclass_fields = {
            field: field for field in user_input.index_fields
        }
        data_class = None
    else:
        data_class, user_input.field_names_to_dataclass_fields = create_dataclass(
            user_input=user_input
        )

    if user_input.dataset_type in [DatasetTypes.DOCARRAY, DatasetTypes.DEMO]:
        print_callback('⬇ Pull DocumentArray dataset')
        da = _pull_docarray(user_input.dataset_name, user_input.admin_name)
        da = _update_fields_and_metadata(da, user_input)
    elif user_input.dataset_type == DatasetTypes.PATH:
        print_callback('💿 Loading files from disk')
        da = _load_from_disk(user_input=user_input, data_class=data_class)
    elif user_input.dataset_type == DatasetTypes.S3_BUCKET:
        print_callback('🗄 Loading files from S3')
        da = _list_files_from_s3_bucket(user_input=user_input, data_class=data_class)
    elif user_input.dataset_type == DatasetTypes.ELASTICSEARCH:
        print_callback('🔍 Loading data from Elasticsearch')
        da = _extract_es_data(user_input=user_input, data_class=data_class)

    da = set_modality_da(da)
    _add_metadata_to_chunks(da, user_input)

    if da is None:
        raise ValueError(
            f'Could not load DocumentArray dataset. Please check your configuration: {user_input}.'
        )
    if 'NOW_CI_RUN' in os.environ:
        da = da[:MAX_DOCS_FOR_TESTING]
    return da
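
# Illustrative usage sketch (not part of the original module): roughly how
# `load_data` might be driven for a local folder dataset. The way `UserInput`
# is constructed here and the field name 'image.png' are assumptions, not
# something this module prescribes.
#
#     from now.constants import DatasetTypes
#     from now.now_dataclasses import UserInput
#
#     user_input = UserInput()
#     user_input.dataset_type = DatasetTypes.PATH
#     user_input.dataset_path = '~/my_dataset'
#     user_input.index_fields = ['image.png']
#     da = load_data(user_input)
#     print(f'loaded {len(da)} documents')
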
def _add_metadata_to_chunks(da, user_input):
    dataclass_fields_to_field_names = {
        v: k for k, v in user_input.field_names_to_dataclass_fields.items()
    }
    for doc in da:
        for dataclass_field, meta_dict in doc._metadata['multi_modal_schema'].items():
            field_name = dataclass_fields_to_field_names.get(dataclass_field, None)
            if 'position' in meta_dict:
                get_chunk_by_field_name(doc, dataclass_field)._metadata[
                    'field_name'
                ] = field_name


def _update_fields_and_metadata(
    da: DocumentArray, user_input: UserInput
) -> DocumentArray:
    """Add selected index fields to da, add the tags, remove non-index chunks,
    and update the multi modal schema."""
    if not da:
        return da
    all_fields = da[0]._metadata['multi_modal_schema'].keys()
    for doc in da:
        filtered_chunks = []
        filtered_chunk_names = []
        for field in all_fields:
            field_doc = get_chunk_by_field_name(doc, field)
            if field not in user_input.index_fields:
                if field_doc.blob or field_doc.tensor is not None:
                    continue
                doc.tags.update(
                    {
                        field: field_doc.content
                        if isinstance(field_doc.content, str)
                        else field_doc.uri
                    }
                )
            else:
                filtered_chunks.append(field_doc)
                filtered_chunk_names.append(field)
        doc.chunks = filtered_chunks
        # keep only the index fields in metadata
        doc._metadata['multi_modal_schema'] = {
            field: doc._metadata['multi_modal_schema'][field]
            for field in filtered_chunk_names
        }
        # update the positions accordingly to access the chunks
        for position, field in enumerate(filtered_chunk_names):
            doc._metadata['multi_modal_schema'][field]['position'] = int(position)
    return da


def _pull_docarray(dataset_name: str, admin_name: str) -> DocumentArray:
    dataset_name = (
        admin_name + '/' + dataset_name if '/' not in dataset_name else dataset_name
    )
    try:
        docs = DocumentArray.pull(name=dataset_name, show_progress=True)
        if is_multimodal(docs[0]):
            return docs
        else:
            raise ValueError(
                f'The dataset {dataset_name} does not contain a multimodal DocumentArray. '
                f'Please check documentation https://docarray.jina.ai/fundamentals/dataclass/construct/'
            )
    except Exception:
        raise ValueError(
            'DocumentArray does not exist or you do not have access to it. '
            'Make sure to add user name as a prefix. Check documentation here. '
            'https://docarray.jina.ai/fundamentals/cloud-support/data-management/'
        )


def _extract_es_data(user_input: UserInput, data_class: Type) -> DocumentArray:
    query = {
        'query': {'match_all': {}},
        '_source': True,
    }
    es_extractor = ElasticsearchExtractor(
        query=query,
        index=user_input.es_index_name,
        user_input=user_input,
        data_class=data_class,
        connection_str=user_input.es_host_name,
    )
    extracted_docs = es_extractor.extract()
    return extracted_docs


def _load_from_disk(user_input: UserInput, data_class: Type) -> DocumentArray:
    """
    Loads the data from disk into multimodal documents.

    :param user_input: The user input object.
    :param data_class: The dataclass to use for the DocumentArray.
    """
    dataset_path = user_input.dataset_path.strip()
    dataset_path = os.path.expanduser(dataset_path)
    if os.path.isfile(dataset_path):
        try:
            da = DocumentArray.load_binary(dataset_path)
            if is_multimodal(da[0]):
                da = _update_fields_and_metadata(da, user_input)
                return da
            else:
                raise ValueError(
                    f'The file {dataset_path} does not contain a multimodal DocumentArray. '
                    f'Please check documentation https://docarray.jina.ai/fundamentals/dataclass/construct/'
                )
        except Exception:
            print(f'Failed to load the binary file provided under path {dataset_path}')
            exit(1)
    elif os.path.isdir(dataset_path):
        with yaspin_extended(
            sigmap=sigmap, text="Loading data from folder", color="green"
        ) as spinner:
            spinner.ok('🏭')
            docs = from_files_local(
                dataset_path,
                user_input.index_fields,
                user_input.field_names_to_dataclass_fields,
                data_class,
            )
            return docs
    else:
        raise ValueError(
            f'The provided dataset path {dataset_path} does not'
            f' appear to be a valid file or folder on your system.'
        )
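
# Illustrative sketch (not part of the original module): the naming convention in
# `_pull_docarray` prefixes the admin name when the dataset name carries no
# namespace, so both calls below would resolve to the same hub dataset
# (the names are made up):
#
#     _pull_docarray('my-team/fashion-data', admin_name='my-team')
#     _pull_docarray('fashion-data', admin_name='my-team')  # becomes 'my-team/fashion-data'
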
def from_files_local(
    path: str,
    fields: List[str],
    field_names_to_dataclass_fields: Dict,
    data_class: Type,
) -> DocumentArray:
    """Creates a multimodal DocumentArray from a list of file paths or the content of the files.

    :param path: The path to the directory
    :param fields: The fields to search for in the directory
    :param field_names_to_dataclass_fields: The mapping of the field names to the dataclass fields
    :param data_class: The dataclass to use for the document
    :return: A DocumentArray with the documents
    """
    file_paths = []
    for root, dirs, files in os.walk(path):
        file_paths.extend(
            [os.path.join(root, file) for file in files if not file.startswith('.')]
        )
    folder_generator = os.walk(path, topdown=True)
    current_level = folder_generator.__next__()
    folder_structure = 'sub_folders' if len(current_level[1]) > 0 else 'single_folder'
    if folder_structure == 'sub_folders':
        docs = create_docs_from_subdirectories(
            file_paths, fields, field_names_to_dataclass_fields, data_class
        )
    else:
        docs = create_docs_from_files(
            file_paths, fields, field_names_to_dataclass_fields, data_class
        )
    return DocumentArray(docs)
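
# Sketch of the two folder layouts `from_files_local` distinguishes (directory
# and file names are hypothetical). Any subdirectory at the top level switches
# the loader to `create_docs_from_subdirectories`; otherwise it falls back to
# `create_docs_from_files`:
#
#     dataset/              # 'sub_folders': one document per subdirectory
#         item_0/
#             image.png
#             data.json
#         item_1/
#             ...
#
#     dataset/              # 'single_folder': one document per matching file
#         a.png
#         b.png
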
def create_docs_from_subdirectories(
    file_paths: List,
    fields: List[str],
    field_names_to_dataclass_fields: Dict,
    data_class: Type,
    path: str = None,
    is_s3_dataset: bool = False,
) -> List[Document]:
    """
    Creates a list of multimodal documents from a list of subdirectories.

    :param file_paths: The list of file paths
    :param fields: The fields to search for in the directory
    :param field_names_to_dataclass_fields: The mapping of the field names to the dataclass fields
    :param data_class: The dataclass to use for the document
    :param path: The path to the directory
    :param is_s3_dataset: Whether the dataset is stored on s3
    :return: The list of documents
    """
    docs = []
    folder_files = defaultdict(list)
    for file in file_paths:
        path_to_last_folder = (
            '/'.join(file.split('/')[:-1])
            if is_s3_dataset
            else os.sep.join(file.split(os.sep)[:-1])
        )
        folder_files[path_to_last_folder].append(file)
    for folder, files in folder_files.items():
        kwargs = {}
        tags_loaded_local = {}
        _s3_uri_for_tags = ''
        file_info = [
            _extract_file_and_full_file_path(file, path, is_s3_dataset)
            for file in files
        ]
        # first store index fields given as files
        for file, file_full_path in file_info:
            if file in fields:
                kwargs[field_names_to_dataclass_fields[file]] = file_full_path
        # next check json files that can also contain index fields, and carry on data
        for file, file_full_path in file_info:
            if file.endswith('.json'):
                if is_s3_dataset:
                    _s3_uri_for_tags = file_full_path
                    for field in data_class.__annotations__.keys():
                        if field not in kwargs.keys():
                            kwargs[field] = file_full_path
                else:
                    with open(file_full_path) as f:
                        json_data = flatten_dict(json.load(f))
                        for field, value in json_data.items():
                            if field in fields:
                                kwargs[field_names_to_dataclass_fields[field]] = value
                            else:
                                tags_loaded_local[field] = value

        doc = Document(data_class(**kwargs))
        if _s3_uri_for_tags:
            doc._metadata['_s3_uri_for_tags'] = _s3_uri_for_tags
        elif tags_loaded_local:
            doc.tags.update(tags_loaded_local)
        docs.append(doc)
    return docs
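
# Sketch of how one subdirectory becomes a single multimodal document (file
# names, field names, and JSON content are made up). With index fields
# ['image.png', 'title'], 'image.png' is picked up as a file-based index field,
# 'title' is read from the JSON file, and the remaining JSON keys land in
# `doc.tags`:
#
#     item_0/
#         image.png
#         data.json       # {"title": "red shoe", "price": 39.9}
#
#     # resulting document, roughly:
#     #   chunks: the image and the text "red shoe"
#     #   tags:   {"price": 39.9}
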
def create_docs_from_files(
    file_paths: List,
    fields: List[str],
    field_names_to_dataclass_fields: Dict,
    data_class: Type,
    path: str = None,
    is_s3_dataset: bool = False,
) -> List[Document]:
    """
    Creates a list of multimodal documents from a list of files.

    :param file_paths: List of file paths
    :param fields: The fields to search for in the directory
    :param field_names_to_dataclass_fields: The mapping of the field names to the dataclass fields
    :param data_class: The dataclass to use for the document
    :param path: The path to the directory
    :param is_s3_dataset: Whether the dataset is stored on s3
    :return: A list of documents
    """
    docs = []
    for file in file_paths:
        kwargs = {}
        file, file_full_path = _extract_file_and_full_file_path(
            file, path, is_s3_dataset
        )
        file_extension = file.split('.')[-1]
        if (
            file_extension == fields[0].split('.')[-1]
        ):  # fields should have only one index field in case of files only
            kwargs[field_names_to_dataclass_fields[fields[0]]] = file_full_path
            docs.append(Document(data_class(**kwargs)))
    return docs
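
# Sketch for the flat-folder case (names are hypothetical): with a single index
# field such as 'image.png', the comparison is purely extension-based, so every
# file ending in '.png' fills that one dataclass field of its own document:
#
#     docs = create_docs_from_files(
#         ['dataset/a.png', 'dataset/b.png'],
#         fields=['image.png'],
#         field_names_to_dataclass_fields={'image.png': 'image_png'},  # assumed mapping
#         data_class=MyDataClass,  # as generated by create_dataclass
#     )
#     # -> one document for a.png and one for b.png
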
def _list_s3_file_paths(bucket, folder_prefix):
    """
    Lists the s3 file paths in an optimized way by finding the best level to use
    concurrent calls on in the file structure, using a threadpool.

    :param bucket: The s3 bucket used
    :param folder_prefix: The root folder prefix
    :return: A list of all s3 paths
    """
    # TODO bucket is not thread safe and outputs duplicate files for different prefixes
    # first_file = get_first_file_in_folder_structure_s3(bucket, folder_prefix)
    # structure_identifier = first_file[len(folder_prefix) :].split('/')
    # folder_structure = (
    #     'sub_folders' if len(structure_identifier) > 1 else 'single_folder'
    # )
    #
    # def get_level_order_prefixes(folder_prefix, level=1):
    #     """
    #     Gets the list of prefixes in a specific level. Levels are defined by the folder structure as follows:
    #     level_1/level_2/.../level_n/file.ext
    #
    #     :param folder_prefix: The current level prefix
    #     :param level: The desired level we want to get to
    #
    #     :return: A list of prefixes
    #     """
    #     level_prefixes = [
    #         obj['Prefix']
    #         for obj in bucket.meta.client.list_objects(
    #             Bucket=bucket.name, Prefix=folder_prefix, Delimiter='/'
    #         )['CommonPrefixes']
    #     ]
    #     if level == 1:
    #         return level_prefixes
    #     else:
    #         prefix_list = []
    #         for prefix in level_prefixes:
    #             prefix_list += get_level_order_prefixes(prefix, level - 1)
    #         return prefix_list
    #
    # def get_prefixes(max_levels=len(structure_identifier) - 2):
    #     """
    #     Finds the best level for the prefixes
    #
    #     :param max_levels: The maximum number of levels we can get to. This defaults to
    #         len(structure_identifier) - 2 because the last level (len(structure_identifier) - 1)
    #         will only have files, so it won't have any common prefixes inside.
    #
    #     :return: A list of prefixes
    #     """
    #     level = 1
    #     list_prefixes = get_level_order_prefixes(folder_prefix, level)
    #     prefixes_states = [list_prefixes]
    #     while level < max_levels and len(list_prefixes) < NUM_FOLDERS_THRESHOLD:
    #         level += 1
    #         list_prefixes = get_level_order_prefixes(folder_prefix, level)
    #         prefixes_states.append(list_prefixes)
    #     if len(list_prefixes) > NUM_FOLDERS_THRESHOLD and len(prefixes_states) > 1:
    #         return prefixes_states[-2]
    #     return list_prefixes
    #
    # objects = []
    # if folder_structure == 'sub_folders':
    #     prefixes = get_prefixes()
    #     # TODO: change cpu count to a fixed number
    #     with ThreadPoolExecutor(max_workers=20) as executor:
    #         futures = []
    #         for prefix in prefixes:
    #             pref = ''.join(prefix)
    #             f = executor.submit(
    #                 lambda: list(bucket.objects.filter(Prefix=f'{pref}'))
    #             )
    #             futures.append(f)
    #         for f in futures:
    #             objects += f.result()
    # else:
    #     objects = list(bucket.objects.filter(Prefix=folder_prefix))
    objects = list(bucket.objects.filter(Prefix=folder_prefix))
    return [
        obj.key
        for obj in objects
        if not obj.key.endswith('/') and not obj.key.split('/')[-1].startswith('.')
    ]


def _list_files_from_s3_bucket(
    user_input: UserInput, data_class: Type
) -> DocumentArray:
    """
    Loads the data from s3 into multimodal documents.

    :param user_input: The user input object.
    :param data_class: The dataclass to use for the DocumentArray.
    :return: The DocumentArray with the documents.
    """
    bucket, folder_prefix = get_s3_bucket_and_folder_prefix(user_input)
    first_file = get_first_file_in_folder_structure_s3(bucket, folder_prefix)
    structure_identifier = first_file[len(folder_prefix) :].split('/')
    folder_structure = (
        'sub_folders' if len(structure_identifier) > 1 else 'single_folder'
    )

    with yaspin_extended(
        sigmap=sigmap, text="Listing files from S3 bucket ...", color="green"
    ) as spinner:
        file_paths = _list_s3_file_paths(bucket, folder_prefix)
        spinner.ok('🏭')

    with yaspin_extended(
        sigmap=sigmap, text="Creating docarray from S3 bucket files ...", color="green"
    ) as spinner:
        if folder_structure == 'sub_folders':
            docs = create_docs_from_subdirectories(
                file_paths,
                user_input.index_fields,
                user_input.field_names_to_dataclass_fields,
                data_class,
                user_input.dataset_path,
                is_s3_dataset=True,
            )
        else:
            docs = create_docs_from_files(
                file_paths,
                user_input.index_fields,
                user_input.field_names_to_dataclass_fields,
                data_class,
                user_input.dataset_path,
                is_s3_dataset=True,
            )
        spinner.ok('👝')
    return DocumentArray(docs)


def _extract_file_and_full_file_path(file_path, path=None, is_s3_dataset=False):
    """
    Extracts the file name and the full file path from an s3 object or a local path.

    :param file_path: The file path
    :param path: The path to the directory
    :param is_s3_dataset: Whether the dataset is stored on s3
    :return: The file name and the full file path
    """
    if is_s3_dataset:
        file = file_path.split('/')[-1]
        file_full_path = '/'.join(path.split('/')[:3]) + '/' + file_path
    else:
        file_full_path = file_path
        file = file_path.split(os.sep)[-1]
    return file, file_full_path


def _get_modality(document: Document):
    """
    Detect the document's modality based on its `modality` or `mime_type` attributes.

    :param document: The document to detect the modality for.
    """
    modalities = ['text', 'image', 'video']
    if document.modality:
        return document.modality
    mime_type_class = document.mime_type.split('/')[0]
    if document.mime_type == 'application/json':
        return 'text'
    if mime_type_class in modalities:
        return mime_type_class
    document.summary()
    raise ValueError('Unknown modality')
def set_modality_da(documents: DocumentArray):
    """
    Set the document's modality based on its `modality` or `mime_type` attributes.

    :param documents: The DocumentArray to set the modality for.
    :return: The DocumentArray with the modality set.
    """
    for doc in documents:
        for chunk in doc.chunks:
            chunk.modality = chunk.modality or _get_modality(chunk)
    return documents
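
# Sketch of how modalities get resolved by `set_modality_da` / `_get_modality`
# (the document below is made up): an explicit `modality` wins, otherwise the
# mime type decides, with 'application/json' treated as text.
#
#     from docarray import Document, DocumentArray
#
#     doc = Document(chunks=[Document(uri='a.png', mime_type='image/png')])
#     set_modality_da(DocumentArray([doc]))
#     assert doc.chunks[0].modality == 'image'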