Module files
Sync client for files workflow.
upload
def upload(paths: List[Path],
api_key: Optional[str] = None,
api_url: Optional[str] = None,
workspace_name: str = DEFAULT_WORKSPACE_NAME,
write_mode: WriteMode = WriteMode.KEEP,
blocking: bool = True,
timeout_s: Optional[int] = None,
show_progress: bool = True,
recursive: bool = False,
desired_file_types: Optional[List[str]] = None,
enable_parallel_processing: bool = False,
safe_mode: bool = False) -> S3UploadSummary
Upload a folder to deepset Cloud.
Arguments:
paths
: Path to the folder to upload. If the folder contains unsupported file types, they're skipped. deepset Cloud supports csv, docx, html, json, md, txt, pdf, pptx, xlsx, xml.
api_key
: deepset Cloud API key to use for authentication.
api_url
: API URL to use for authentication.
workspace_name
: Name of the workspace to upload the files to. It uses the workspace from the .ENV file by default.
write_mode
: Specifies what to do when a file with the same name already exists in the workspace. Possible options are: KEEP - uploads the file with the same name and keeps both files in the workspace. OVERWRITE - overwrites the file that is in the workspace. FAIL - fails to upload the file with the same name.
blocking
: Whether to wait for the files to be uploaded and displayed in deepset Cloud.
timeout_s
: Timeout in seconds for the `blocking` parameter.
show_progress
: Shows the upload progress.
recursive
: Uploads files from subfolders as well.
desired_file_types
: A list of allowed file types to upload, defaults to [".txt", ".pdf", ".docx", ".pptx", ".xlsx", ".xml", ".csv", ".html", ".md", ".json"].
enable_parallel_processing
: If `True`, deepset Cloud ingests files in parallel. Use this to speed up the upload process. Make sure you are not running concurrent uploads for the same files.
safe_mode
: If `True`, disables ingesting files in parallel.
download
def download(workspace_name: str = DEFAULT_WORKSPACE_NAME,
file_dir: Optional[Union[Path, str]] = None,
name: Optional[str] = None,
odata_filter: Optional[str] = None,
include_meta: bool = True,
batch_size: int = 50,
api_key: Optional[str] = None,
api_url: Optional[str] = None,
show_progress: bool = True,
timeout_s: Optional[int] = None,
safe_mode: bool = False) -> None
Download a folder from deepset Cloud.
Downloads all files from a workspace to a local folder.
Arguments:
workspace_name
: Name of the workspace to download the files from. It uses the workspace from the .ENV file by default.
file_dir
: Path to the local folder to download the files to.
name
: Name of the file to filter by.
odata_filter
: OData filter by file metadata.
include_meta
: Whether to include the file metadata in the folder.
batch_size
: Batch size for the listing.
api_key
: API key to use for authentication.
api_url
: API URL to use for authentication.
show_progress
: Shows the download progress.
timeout_s
: Timeout in seconds for the API requests.
safe_mode
: If `True`, disables ingesting files in parallel.
upload_texts
def upload_texts(files: List[DeepsetCloudFile],
api_key: Optional[str] = None,
api_url: Optional[str] = None,
workspace_name: str = DEFAULT_WORKSPACE_NAME,
write_mode: WriteMode = WriteMode.KEEP,
blocking: bool = True,
timeout_s: Optional[int] = None,
show_progress: bool = True,
enable_parallel_processing: bool = False) -> S3UploadSummary
Upload texts to deepset Cloud.
Arguments:
files
: List of DeepsetCloudFile objects to upload.
api_key
: deepset Cloud API key to use for authentication.
api_url
: API URL to use for authentication.
workspace_name
: Name of the workspace to upload the files to. It uses the workspace from the .ENV file by default.
write_mode
: Specifies what to do when a file with the same name already exists in the workspace. Possible options are: KEEP - uploads the file with the same name and keeps both files in the workspace. OVERWRITE - overwrites the file that is in the workspace. FAIL - fails to upload the file with the same name.
blocking
: Whether to wait for the files to be uploaded and listed in deepset Cloud.
timeout_s
: Timeout in seconds for the `blocking` parameter.
show_progress
: Shows the upload progress.
enable_parallel_processing
: If `True`, deepset Cloud ingests files in parallel. Use this to speed up the upload process. Make sure you are not running concurrent uploads for the same files.
Example:
from deepset_cloud_sdk.workflows.sync_client.files import upload_texts, DeepsetCloudFile
upload_texts(
    api_key="<deepsetCloud_API_key>",
    workspace_name="<default_workspace>", # optional, by default the environment variable "DEFAULT_WORKSPACE_NAME" is used
    files=[
        DeepsetCloudFile(
            name="example.txt",
            text="this is text",
            meta={"key": "value"}, # optional
        )
    ],
    blocking=True, # optional, by default True
    timeout_s=300, # optional, by default 300
)
upload_bytes
def upload_bytes(files: List[DeepsetCloudFileBytes],
api_key: Optional[str] = None,
api_url: Optional[str] = None,
workspace_name: str = DEFAULT_WORKSPACE_NAME,
write_mode: WriteMode = WriteMode.KEEP,
blocking: bool = True,
timeout_s: Optional[int] = None,
show_progress: bool = True,
enable_parallel_processing: bool = False) -> S3UploadSummary
Upload any supported file types to deepset Cloud. These include .csv, .docx, .html, .json, .md, .txt, .pdf, .pptx, .xlsx and .xml.
Arguments:
files
: List of DeepsetCloudFileBytes objects to upload.
api_key
: deepset Cloud API key to use for authentication.
api_url
: API URL to use for authentication.
workspace_name
: Name of the workspace to upload the files to. It uses the workspace from the .ENV file by default.
write_mode
: Specifies what to do when a file with the same name already exists in the workspace. Possible options are: KEEP - uploads the file with the same name and keeps both files in the workspace. OVERWRITE - overwrites the file that is in the workspace. FAIL - fails to upload the file with the same name.
blocking
: Whether to wait for the files to be uploaded and listed in deepset Cloud.
timeout_s
: Timeout in seconds for the `blocking` parameter.
show_progress
: Shows the upload progress.
enable_parallel_processing
: If `True`, deepset Cloud ingests files in parallel. Use this to speed up the upload process. Make sure you are not running concurrent uploads for the same files.
get_upload_session
def get_upload_session(
session_id: UUID,
api_key: Optional[str] = None,
api_url: Optional[str] = None,
workspace_name: str = DEFAULT_WORKSPACE_NAME) -> UploadSessionStatus
Get the status of an upload session.
Arguments:
session_id
: ID of the upload session to get the status for.
api_key
: deepset Cloud API key to use for authentication.
api_url
: API URL to use for authentication.
workspace_name
: Name of the workspace to upload the files to.
list_files
def list_files(
api_key: Optional[str] = None,
api_url: Optional[str] = None,
workspace_name: str = DEFAULT_WORKSPACE_NAME,
name: Optional[str] = None,
odata_filter: Optional[str] = None,
batch_size: int = 100,
timeout_s: Optional[int] = None) -> Generator[List[File], None, None]
List files in a deepset Cloud workspace.
Arguments:
api_key
: deepset Cloud API key to use for authentication.
api_url
: API URL to use for authentication.
workspace_name
: Name of the workspace to list the files from. It uses the workspace from the .ENV file by default.
name
: Name of the file to filter for.
odata_filter
: OData filter to apply to the file list. For example, `odata_filter="category eq 'news'"` lists files with metadata `{"meta": {"category": "news"}}`.
batch_size
: Batch size to use for the file list.
timeout_s
: Timeout in seconds for the API requests.
list_upload_sessions
def list_upload_sessions(
api_key: Optional[str] = None,
api_url: Optional[str] = None,
workspace_name: str = DEFAULT_WORKSPACE_NAME,
is_expired: Optional[bool] = False,
batch_size: int = 100,
timeout_s: Optional[int] = None
) -> Generator[List[UploadSessionDetail], None, None]
List the details of all upload sessions, including the closed ones.
Arguments:
api_key
: deepset Cloud API key to use for authentication.
api_url
: API URL to use for authentication.
workspace_name
: Name of the workspace whose sessions you want to list. It uses the workspace from the .ENV file by default.
is_expired
: Lists expired sessions.
batch_size
: Batch size to use for the session list.
timeout_s
: Timeout in seconds for the API request.