{"openapi":"3.1.0","info":{"title":"FishAudio OpenAPI","version":"1"},"paths":{"/wallet/{user_id}/package":{"get":{"summary":"Get User Package","security":[{"BearerAuth":[]}],"parameters":[{"in":"path","name":"user_id","description":"User ID or 'self'","required":false,"schema":{"default":"self","title":"User Id","type":"string"},"deprecated":false}],"responses":{"200":{"description":"Request fulfilled, document follows","headers":{},"content":{"application/json":{"schema":{"properties":{"user_id":{"title":"User Id","type":"string"},"type":{"title":"Type","type":"string"},"total":{"title":"Total","type":"integer"},"balance":{"title":"Balance","type":"integer"},"created_at":{"format":"date-time","title":"Created At","type":"string"},"updated_at":{"format":"date-time","title":"Updated At","type":"string"},"finished_at":{"format":"date-time","title":"Finished At","type":"string"},"stripe_subscription_id":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Stripe Subscription Id"},"stripe_price_id":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Stripe Price Id"},"billing_period":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Billing Period"},"current_period_end":{"anyOf":[{"format":"date-time","type":"string"},{"type":"null"}],"default":null,"title":"Current Period End"},"cancel_at_period_end":{"anyOf":[{"type":"boolean"},{"type":"null"}],"default":null,"title":"Cancel At Period End"},"cancel_at":{"anyOf":[{"format":"date-time","type":"string"},{"type":"null"}],"default":null,"title":"Cancel At"},"scheduled_change":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"title":"Scheduled Change"},"last_synced_at":{"anyOf":[{"format":"date-time","type":"string"},{"type":"null"}],"default":null,"title":"Last Synced At"},"extra_balance":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":0,"title":"Extra Balance"},"has_used_trial":{"default":false,"title":"Has Used 
Trial","type":"boolean"}},"required":["user_id","type","total","balance","created_at","updated_at","finished_at"],"type":"object"}}}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Wallet"]}},"/wallet/{user_id}/api-credit":{"get":{"summary":"Get API Credit","security":[{"BearerAuth":[]}],"parameters":[{"in":"query","name":"check_free_credit","description":"","required":false,"schema":{"default":false,"title":"Check Free Credit","type":"boolean"},"deprecated":false},{"in":"query","name":"team_id","description":"","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Team Id"},"deprecated":false},{"in":"path","name":"user_id","description":"User ID or 'self'","required":false,"schema":{"default":"self","title":"User Id","type":"string"},"deprecated":false}],"responses":{"200":{"description":"Request fulfilled, document follows","headers":{},"content":{"application/json":{"schema":{"properties":{"_id":{"title":"Id","type":"string"},"user_id":{"title":"User Id","type":"string"},"credit":{"title":"Credit","type":"string"},"created_at":{"format":"date-time","title":"Created 
At","type":"string"},"updated_at":{"format":"date-time","title":"Updated At","type":"string"},"has_phone_sha256":{"title":"Has Phone Sha256","type":"boolean"},"has_free_credit":{"anyOf":[{"type":"boolean"},{"type":"null"}],"default":null,"title":"Has Free Credit"}},"required":["_id","user_id","credit","created_at","updated_at","has_phone_sha256"],"type":"object"}}}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Wallet"]}},"/model":{"get":{"summary":"List Models","security":[{"BearerAuth":[]}],"parameters":[{"in":"query","name":"page_size","description":"Page size","required":false,"schema":{"default":10,"minimum":1,"title":"Page Size","type":"integer"},"deprecated":false},{"in":"query","name":"page_number","description":"Page number","required":false,"schema":{"default":1,"minimum":1,"title":"Page Number","type":"integer"},"deprecated":false},{"in":"query","name":"title","description":"Title to filter models","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Title"},"deprecated":false},{"in":"query","name":"tag","description":"Tag to filter 
models","required":false,"schema":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"title":"Tag"},"deprecated":false},{"in":"query","name":"self","description":"If True, only models created by the user will be returned","required":false,"schema":{"default":false,"title":"Self","type":"boolean"},"deprecated":false},{"in":"query","name":"author_id","description":"Author ID to filter models, this will be ignored if self is True","required":false,"schema":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Author Id"},"deprecated":false},{"in":"query","name":"language","description":"Language to filter models","required":false,"schema":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"title":"Language"},"deprecated":false},{"in":"query","name":"title_language","description":"Title language to filter models","required":false,"schema":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"title":"Title Language"},"deprecated":false},{"in":"query","name":"sort_by","description":"","required":false,"schema":{"default":"score","enum":["score","task_count","created_at"],"title":"Sort By","type":"string"},"deprecated":false}],"responses":{"200":{"description":"Request fulfilled, document follows","headers":{},"content":{"application/json":{"schema":{"properties":{"total":{"title":"Total","type":"integer"},"items":{"items":{"$ref":"#/components/schemas/ModelEntity"},"title":"Items","type":"array"},"has_more":{"anyOf":[{"type":"boolean"},{"type":"null"}],"default":null,"title":"Has More"}},"required":["total","items"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error 
type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Model"]},"post":{"summary":"Create Model for Users via API","security":[{"BearerAuth":[]}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"properties":{"visibility":{"default":"public","description":"Model visibility: public models are shown on the discovery page, unlist models can be accessed by anyone with the link, private models are only visible to the creator","enum":["public","unlist","private"],"title":"Visibility","type":"string"},"type":{"const":"tts","description":"Model type, tts is for text to speech","title":"Type","type":"string"},"title":{"description":"Model title or name","title":"Title","type":"string"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Model description","title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"description":"Model cover image, this is required if the model is public","title":"Cover Image"},"train_mode":{"const":"fast","description":"Model train mode, for TTS model, fast means model instantly available after creation","title":"Train Mode","type":"string"},"voices":{"anyOf":[{"items":{"format":"binary","type":"string"},"type":"array"},{"format":"binary","type":"string"}],"description":"Upload voice files that will be used to tune the model","title":"Voices"},"texts":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"description":"Texts corresponding to the voices, if unspecified, ASR will be performed on the voices","title":"Texts"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"description":"Model 
tags","title":"Tags"},"enhance_audio_quality":{"default":true,"description":"Enhance audio quality","title":"Enhance Audio Quality","type":"boolean"},"generate_sample":{"default":false,"description":"Generate default text","title":"Generate Sample","type":"boolean"}},"required":["type","title","train_mode","voices"],"type":"object"}},"application/x-www-form-urlencoded":{"schema":{"properties":{"visibility":{"default":"public","description":"Model visibility: public models are shown on the discovery page, unlist models can be accessed by anyone with the link, private models are only visible to the creator","enum":["public","unlist","private"],"title":"Visibility","type":"string"},"type":{"const":"tts","description":"Model type, tts is for text to speech","title":"Type","type":"string"},"title":{"description":"Model title or name","title":"Title","type":"string"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Model description","title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"description":"Model cover image, this is required if the model is public","title":"Cover Image"},"train_mode":{"const":"fast","description":"Model train mode, for TTS model, fast means model instantly available after creation","title":"Train Mode","type":"string"},"voices":{"anyOf":[{"items":{"format":"binary","type":"string"},"type":"array"},{"format":"binary","type":"string"}],"description":"Upload voice files that will be used to tune the model","title":"Voices"},"texts":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"description":"Texts corresponding to the voices, if unspecified, ASR will be performed on the voices","title":"Texts"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"description":"Model tags","title":"Tags"},"enhance_audio_quality":{"default":true,"description":"Enhance audio 
quality","title":"Enhance Audio Quality","type":"boolean"},"generate_sample":{"default":false,"description":"Generate default text","title":"Generate Sample","type":"boolean"}},"required":["type","title","train_mode","voices"],"type":"object"}},"multipart/form-data":{"schema":{"properties":{"visibility":{"default":"public","description":"Model visibility: public models are shown on the discovery page, unlist models can be accessed by anyone with the link, private models are only visible to the creator","enum":["public","unlist","private"],"title":"Visibility","type":"string"},"type":{"const":"tts","description":"Model type, tts is for text to speech","title":"Type","type":"string"},"title":{"description":"Model title or name","title":"Title","type":"string"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Model description","title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"description":"Model cover image, this is required if the model is public","title":"Cover Image"},"train_mode":{"const":"fast","description":"Model train mode, for TTS model, fast means model instantly available after creation","title":"Train Mode","type":"string"},"voices":{"anyOf":[{"items":{"format":"binary","type":"string"},"type":"array"},{"format":"binary","type":"string"}],"description":"Upload voice files that will be used to tune the model","title":"Voices"},"texts":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"description":"Texts corresponding to the voices, if unspecified, ASR will be performed on the voices","title":"Texts"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"description":"Model tags","title":"Tags"},"enhance_audio_quality":{"default":true,"description":"Enhance audio quality","title":"Enhance Audio 
Quality","type":"boolean"},"generate_sample":{"default":false,"description":"Generate default text","title":"Generate Sample","type":"boolean"}},"required":["type","title","train_mode","voices"],"type":"object"}},"application/msgpack":{"schema":{"properties":{"visibility":{"default":"public","description":"Model visibility: public models are shown on the discovery page, unlist models can be accessed by anyone with the link, private models are only visible to the creator","enum":["public","unlist","private"],"title":"Visibility","type":"string"},"type":{"const":"tts","description":"Model type, tts is for text to speech","title":"Type","type":"string"},"title":{"description":"Model title or name","title":"Title","type":"string"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Model description","title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"description":"Model cover image, this is required if the model is public","title":"Cover Image"},"train_mode":{"const":"fast","description":"Model train mode, for TTS model, fast means model instantly available after creation","title":"Train Mode","type":"string"},"voices":{"anyOf":[{"items":{"format":"binary","type":"string"},"type":"array"},{"format":"binary","type":"string"}],"description":"Upload voice files that will be used to tune the model","title":"Voices"},"texts":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"default":null,"description":"Texts corresponding to the voices, if unspecified, ASR will be performed on the voices","title":"Texts"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"},{"type":"null"}],"description":"Model tags","title":"Tags"},"enhance_audio_quality":{"default":true,"description":"Enhance audio quality","title":"Enhance Audio Quality","type":"boolean"},"generate_sample":{"default":false,"description":"Generate default 
text","title":"Generate Sample","type":"boolean"}},"required":["type","title","train_mode","voices"],"type":"object"}}}},"responses":{"201":{"description":"Document created, URL follows","headers":{},"content":{"application/json":{"schema":{"properties":{"_id":{"title":"Id","type":"string"},"type":{"enum":["svc","tts"],"title":"Type","type":"string"},"title":{"title":"Title","type":"string"},"description":{"default":"","title":"Description","type":"string"},"cover_image":{"default":"","title":"Cover Image","type":"string"},"train_mode":{"default":"full","enum":["fast","full"],"title":"Train Mode","type":"string"},"state":{"enum":["created","training","trained","failed"],"title":"State","type":"string"},"tags":{"items":{"type":"string"},"title":"Tags","type":"array"},"samples":{"default":[],"items":{"$ref":"#/components/schemas/SampleEntity"},"title":"Samples","type":"array"},"created_at":{"format":"date-time","title":"Created At","type":"string"},"updated_at":{"format":"date-time","title":"Updated At","type":"string"},"languages":{"default":[],"items":{"type":"string"},"title":"Languages","type":"array"},"visibility":{"enum":["public","unlist","private"],"title":"Visibility","type":"string"},"lock_visibility":{"default":false,"title":"Lock Visibility","type":"boolean"},"dmca_taken_down":{"anyOf":[{"type":"boolean"},{"type":"null"}],"default":false,"title":"Dmca Taken Down"},"default_text":{"default":"","title":"Default Text","type":"string"},"quality":{"anyOf":[{"$ref":"#/components/schemas/ModelQualityEntity"},{"type":"null"}],"default":null},"like_count":{"title":"Like Count","type":"integer"},"mark_count":{"title":"Mark Count","type":"integer"},"shared_count":{"title":"Shared Count","type":"integer"},"task_count":{"title":"Task 
Count","type":"integer"},"unliked":{"default":false,"title":"Unliked","type":"boolean"},"liked":{"default":false,"title":"Liked","type":"boolean"},"marked":{"default":false,"title":"Marked","type":"boolean"},"author":{"$ref":"#/components/schemas/AuthorEntity"}},"required":["_id","type","title","state","tags","created_at","updated_at","visibility","like_count","mark_count","shared_count","task_count","author"],"type":"object"}}}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Model"]}},"/model/{id}":{"get":{"summary":"Get Model","security":[{"BearerAuth":[]}],"parameters":[{"in":"path","name":"id","description":"","required":true,"schema":{"title":"Id","type":"string"},"deprecated":false}],"responses":{"200":{"description":"Request fulfilled, document follows","headers":{},"content":{"application/json":{"schema":{"properties":{"_id":{"title":"Id","type":"string"},"type":{"enum":["svc","tts"],"title":"Type","type":"string"},"title":{"title":"Title","type":"string"},"description":{"default":"","title":"Description","type":"string"},"cover_image":{"default":"","title":"Cover Image","type":"string"},"train_mode":{"default":"full","enum":["fast","full"],"title":"Train 
Mode","type":"string"},"state":{"enum":["created","training","trained","failed"],"title":"State","type":"string"},"tags":{"items":{"type":"string"},"title":"Tags","type":"array"},"samples":{"default":[],"items":{"$ref":"#/components/schemas/SampleEntity"},"title":"Samples","type":"array"},"created_at":{"format":"date-time","title":"Created At","type":"string"},"updated_at":{"format":"date-time","title":"Updated At","type":"string"},"languages":{"default":[],"items":{"type":"string"},"title":"Languages","type":"array"},"visibility":{"enum":["public","unlist","private"],"title":"Visibility","type":"string"},"lock_visibility":{"default":false,"title":"Lock Visibility","type":"boolean"},"dmca_taken_down":{"anyOf":[{"type":"boolean"},{"type":"null"}],"default":false,"title":"Dmca Taken Down"},"default_text":{"default":"","title":"Default Text","type":"string"},"quality":{"anyOf":[{"$ref":"#/components/schemas/ModelQualityEntity"},{"type":"null"}],"default":null},"like_count":{"title":"Like Count","type":"integer"},"mark_count":{"title":"Mark Count","type":"integer"},"shared_count":{"title":"Shared Count","type":"integer"},"task_count":{"title":"Task Count","type":"integer"},"unliked":{"default":false,"title":"Unliked","type":"boolean"},"liked":{"default":false,"title":"Liked","type":"boolean"},"marked":{"default":false,"title":"Marked","type":"boolean"},"author":{"$ref":"#/components/schemas/AuthorEntity"}},"required":["_id","type","title","state","tags","created_at","updated_at","visibility","like_count","mark_count","shared_count","task_count","author"],"type":"object"}}}},"404":{"description":"Nothing matches the given 
URI","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Model"]},"patch":{"summary":"Update Model","security":[{"BearerAuth":[]}],"parameters":[{"in":"path","name":"id","description":"","required":true,"schema":{"title":"Id","type":"string"},"deprecated":false}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"properties":{"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Title"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"title":"Cover Image"},"visibility":{"anyOf":[{"enum":["public","unlist","private"],"type":"string"},{"type":"null"}],"default":null,"title":"Visibility"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"}],"title":"Tags"}},"type":"object"}},"application/x-www-form-urlencoded":{"schema":{"properties":{"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Title"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"title":"Cover 
Image"},"visibility":{"anyOf":[{"enum":["public","unlist","private"],"type":"string"},{"type":"null"}],"default":null,"title":"Visibility"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"}],"title":"Tags"}},"type":"object"}},"multipart/form-data":{"schema":{"properties":{"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Title"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"title":"Cover Image"},"visibility":{"anyOf":[{"enum":["public","unlist","private"],"type":"string"},{"type":"null"}],"default":null,"title":"Visibility"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"}],"title":"Tags"}},"type":"object"}},"application/msgpack":{"schema":{"properties":{"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Title"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"title":"Description"},"cover_image":{"anyOf":[{"format":"binary","type":"string"},{"type":"null"}],"default":null,"title":"Cover Image"},"visibility":{"anyOf":[{"enum":["public","unlist","private"],"type":"string"},{"type":"null"}],"default":null,"title":"Visibility"},"tags":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"string"}],"title":"Tags"}},"type":"object"}}}},"responses":{"200":{"description":"Request fulfilled, document follows","headers":{}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error 
field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Model"]},"delete":{"summary":"Delete Model","security":[{"BearerAuth":[]}],"parameters":[{"in":"path","name":"id","description":"","required":true,"schema":{"title":"Id","type":"string"},"deprecated":false}],"responses":{"200":{"description":"Request fulfilled, document follows","headers":{}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["Model"]}},"/v1/tts":{"post":{"summary":"Text to Speech","security":[{"BearerAuth":[]}],"parameters":[{"in":"header","name":"model","description":"Specify which TTS model to use. 
We recommend `s2-pro`.","required":true,"schema":{"default":"s2-pro","enum":["s1","s2-pro"],"title":"Model","type":"string"},"deprecated":false}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TTSRequest"}},"application/msgpack":{"schema":{"$ref":"#/components/schemas/TTSRequest"}}}},"responses":{"200":{"description":"Request fulfilled, document follows","headers":{"Transfer-Encoding":{"schema":{"type":"string"},"description":"chunked"}}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"402":{"description":"No payment -- see charging schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["OpenAPI v1"],"x-codeSamples":[{"lang":"bash","label":"Single Speaker","source":"curl --request POST \\\n  --url https://api.fish.audio/v1/tts \\\n  --header 'Authorization: Bearer <token>' \\\n  --header 'Content-Type: application/json' \\\n  --header 'model: s2-pro' \\\n  --data '{\n    \"text\": \"Hello! 
Welcome to Fish Audio.\",\n    \"reference_id\": \"model-id\",\n    \"temperature\": 0.7,\n    \"top_p\": 0.7,\n    \"prosody\": {\n      \"speed\": 1,\n      \"volume\": 0,\n      \"normalize_loudness\": true\n    },\n    \"chunk_length\": 300,\n    \"normalize\": true,\n    \"format\": \"mp3\",\n    \"sample_rate\": 44100,\n    \"mp3_bitrate\": 128,\n    \"latency\": \"normal\",\n    \"max_new_tokens\": 1024,\n    \"repetition_penalty\": 1.2,\n    \"min_chunk_length\": 50,\n    \"condition_on_previous_chunks\": true,\n    \"early_stop_threshold\": 1\n  }'"},{"lang":"bash","label":"Multi Speaker (S2-Pro only)","source":"curl --request POST \\\n  --url https://api.fish.audio/v1/tts \\\n  --header 'Authorization: Bearer <token>' \\\n  --header 'Content-Type: application/json' \\\n  --header 'model: s2-pro' \\\n  --data '{\n    \"text\": \"<|speaker:0|>Hello!<|speaker:1|>Hi there!\",\n    \"reference_id\": [\"speaker-a-id\", \"speaker-b-id\"],\n    \"temperature\": 0.7,\n    \"top_p\": 0.7,\n    \"prosody\": {\n      \"speed\": 1,\n      \"volume\": 0,\n      \"normalize_loudness\": true\n    },\n    \"chunk_length\": 300,\n    \"normalize\": true,\n    \"format\": \"mp3\",\n    \"sample_rate\": 44100,\n    \"mp3_bitrate\": 128,\n    \"latency\": \"normal\",\n    \"max_new_tokens\": 1024,\n    \"repetition_penalty\": 1.2,\n    \"min_chunk_length\": 50,\n    \"condition_on_previous_chunks\": true,\n    \"early_stop_threshold\": 1\n  }'"}]}},"/v1/tts/stream/with-timestamp":{"post":{"summary":"Text to Speech Stream with Timestamps","security":[{"BearerAuth":[]}],"parameters":[{"in":"header","name":"model","description":"Specify which TTS model to use. 
We recommend `s2-pro`.","required":true,"schema":{"default":"s2-pro","enum":["s1","s2-pro"],"title":"Model","type":"string"},"deprecated":false}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TTSStreamWithTimestampRequest"}},"application/msgpack":{"schema":{"$ref":"#/components/schemas/TTSStreamWithTimestampRequest"}}}},"responses":{"200":{"description":"Server-Sent Events stream. Each `message` event contains a JSON payload with one base64 audio chunk. Concatenate every `audio_base64` chunk in arrival order to reconstruct the complete audio. `alignment` is the latest cumulative timestamp snapshot for `chunk_seq`; clients should replace the previous snapshot for that chunk instead of appending segments. `chunk_audio_offset_sec` can be added to segment times to derive absolute timestamps in the full audio.","headers":{"Transfer-Encoding":{"schema":{"type":"string"},"description":"chunked"}},"content":{"text/event-stream":{"schema":{"description":"One Server-Sent Events message payload for streaming TTS with timestamps. Each event contains one audio chunk. Concatenate all `audio_base64` chunks in arrival order to reconstruct the complete audio. 
`alignment` is the latest cumulative timestamp snapshot for the reported `chunk_seq`; clients should replace the previous snapshot for that chunk instead of appending segments.","examples":[{"alignment":{"audio_duration":16.24,"segments":[{"end":0.16,"start":0.0,"text":"I"},{"end":0.48,"start":0.16,"text":"can't"},{"end":0.8,"start":0.48,"text":"believe"},{"end":1.12,"start":0.8,"text":"its"},{"end":1.44,"start":1.2,"text":"been"},{"end":1.76,"start":1.44,"text":"this"},{"end":2.48,"start":1.76,"text":"long"},{"end":2.64,"start":2.56,"text":"It"},{"end":3.04,"start":2.72,"text":"feels"},{"end":3.28,"start":3.12,"text":"like"},{"end":4.0,"start":3.36,"text":"forever"},{"end":4.32,"start":4.0,"text":"since"},{"end":4.48,"start":4.32,"text":"we"},{"end":4.96,"start":4.48,"text":"last"},{"end":5.28,"start":4.96,"text":"really"},{"end":5.84,"start":5.28,"text":"talked"},{"end":6.24,"start":6.0,"text":"Ive"},{"end":6.64,"start":6.24,"text":"missed"},{"end":6.96,"start":6.64,"text":"hearing"},{"end":7.2,"start":6.96,"text":"your"},{"end":7.76,"start":7.2,"text":"voice"},{"end":7.92,"start":7.76,"text":"your"},{"end":8.48,"start":7.92,"text":"stories"},{"end":8.72,"start":8.48,"text":"even"},{"end":8.8,"start":8.72,"text":"the"},{"end":9.2,"start":8.8,"text":"little"},{"end":9.52,"start":9.2,"text":"things"},{"end":9.68,"start":9.52,"text":"you"},{"end":10.0,"start":9.68,"text":"used"},{"end":10.08,"start":10.0,"text":"to"},{"end":10.64,"start":10.08,"text":"say"},{"end":10.96,"start":10.64,"text":"How"},{"end":11.12,"start":10.96,"text":"have"},{"end":11.36,"start":11.12,"text":"you"},{"end":11.92,"start":11.36,"text":"been"},{"end":12.24,"start":12.0,"text":"Ive"},{"end":12.48,"start":12.24,"text":"thought"},{"end":12.8,"start":12.48,"text":"about"},{"end":13.2,"start":12.8,"text":"calling"},{"end":13.36,"start":13.2,"text":"you"},{"end":13.68,"start":13.36,"text":"so"},{"end":13.92,"start":13.68,"text":"many"},{"end":14.56,"start":13.92,"text":"times"},{"end":14.72,"star
t":14.56,"text":"but"},{"end":14.88,"start":14.72,"text":"I"},{"end":15.2,"start":14.88,"text":"never"},{"end":15.36,"start":15.2,"text":"knew"},{"end":15.6,"start":15.36,"text":"where"},{"end":15.6,"start":15.6,"text":"to"},{"end":16.24,"start":15.68,"text":"start"}]},"audio_base64":"SUQzBAAAAAAA...","chunk_audio_offset_sec":0.0,"chunk_seq":0,"content":"I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to start."},{"alignment":{"audio_duration":16.24,"segments":[{"end":0.16,"start":0.0,"text":"I"},{"end":0.48,"start":0.16,"text":"can't"},{"end":0.8,"start":0.48,"text":"believe"},{"end":1.12,"start":0.8,"text":"its"},{"end":1.44,"start":1.2,"text":"been"},{"end":1.76,"start":1.44,"text":"this"},{"end":2.48,"start":1.76,"text":"long"},{"end":2.64,"start":2.56,"text":"It"},{"end":3.04,"start":2.72,"text":"feels"},{"end":3.28,"start":3.12,"text":"like"},{"end":4.0,"start":3.36,"text":"forever"},{"end":4.32,"start":4.0,"text":"since"},{"end":4.48,"start":4.32,"text":"we"},{"end":4.96,"start":4.48,"text":"last"},{"end":5.28,"start":4.96,"text":"really"},{"end":5.84,"start":5.28,"text":"talked"},{"end":6.24,"start":6.0,"text":"Ive"},{"end":6.64,"start":6.24,"text":"missed"},{"end":6.96,"start":6.64,"text":"hearing"},{"end":7.2,"start":6.96,"text":"your"},{"end":7.76,"start":7.2,"text":"voice"},{"end":7.92,"start":7.76,"text":"your"},{"end":8.48,"start":7.92,"text":"stories"},{"end":8.72,"start":8.48,"text":"even"},{"end":8.8,"start":8.72,"text":"the"},{"end":9.2,"start":8.8,"text":"little"},{"end":9.52,"start":9.2,"text":"things"},{"end":9.68,"start":9.52,"text":"you"},{"end":10.0,"start":9.68,"text":"used"},{"end":10.08,"start":10.0,"text":"to"},{"end":10.64,"start":10.08,"text":"say"},{"end":10.96,"start":10.64,"text":"How"},{"end":11.12,"start":10.96,"text":"ha
ve"},{"end":11.36,"start":11.12,"text":"you"},{"end":11.92,"start":11.36,"text":"been"},{"end":12.24,"start":12.0,"text":"Ive"},{"end":12.48,"start":12.24,"text":"thought"},{"end":12.8,"start":12.48,"text":"about"},{"end":13.2,"start":12.8,"text":"calling"},{"end":13.36,"start":13.2,"text":"you"},{"end":13.68,"start":13.36,"text":"so"},{"end":13.92,"start":13.68,"text":"many"},{"end":14.56,"start":13.92,"text":"times"},{"end":14.72,"start":14.56,"text":"but"},{"end":14.88,"start":14.72,"text":"I"},{"end":15.2,"start":14.88,"text":"never"},{"end":15.36,"start":15.2,"text":"knew"},{"end":15.6,"start":15.36,"text":"where"},{"end":15.6,"start":15.6,"text":"to"},{"end":16.24,"start":15.68,"text":"start"}]},"audio_base64":"//uSxOAAF...","chunk_audio_offset_sec":0.0,"chunk_seq":0,"content":"I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to 
start."},{"alignment":{"audio_duration":10.48,"segments":[{"end":0.8,"start":0.4,"text":"Seeing"},{"end":0.96,"start":0.8,"text":"you"},{"end":1.44,"start":0.96,"text":"again"},{"end":1.68,"start":1.44,"text":"now"},{"end":2.08,"start":1.68,"text":"makes"},{"end":2.24,"start":2.08,"text":"me"},{"end":2.8,"start":2.24,"text":"realize"},{"end":3.12,"start":2.8,"text":"just"},{"end":3.28,"start":3.12,"text":"how"},{"end":3.6,"start":3.28,"text":"much"},{"end":3.76,"start":3.6,"text":"Ive"},{"end":4.24,"start":3.84,"text":"missed"},{"end":4.56,"start":4.24,"text":"you"},{"end":4.8,"start":4.64,"text":"We"},{"end":5.04,"start":4.8,"text":"have"},{"end":5.36,"start":5.04,"text":"so"},{"end":5.76,"start":5.36,"text":"much"},{"end":5.76,"start":5.76,"text":"to"},{"end":6.16,"start":5.76,"text":"catch"},{"end":6.4,"start":6.16,"text":"up"},{"end":6.72,"start":6.4,"text":"on"},{"end":6.96,"start":6.8,"text":"and"},{"end":7.04,"start":6.96,"text":"I"},{"end":7.36,"start":7.04,"text":"dont"},{"end":7.6,"start":7.36,"text":"even"},{"end":7.84,"start":7.6,"text":"know"},{"end":8.08,"start":7.84,"text":"which"},{"end":8.4,"start":8.08,"text":"part"},{"end":8.48,"start":8.4,"text":"of"},{"end":8.72,"start":8.56,"text":"my"},{"end":8.96,"start":8.72,"text":"life"},{"end":9.12,"start":9.12,"text":"to"},{"end":9.44,"start":9.12,"text":"tell"},{"end":9.6,"start":9.44,"text":"you"},{"end":10.0,"start":9.6,"text":"about"},{"end":10.48,"start":10.08,"text":"first"}]},"audio_base64":"//uSxImAl...","chunk_audio_offset_sec":16.24,"chunk_seq":1,"content":"Seeing you again now makes me realize just how much I’ve missed you. We have so much to catch up on, and I don’t even know which part of my life to tell you about first."}],"properties":{"audio_base64":{"description":"Base64 encoded audio chunk. 
Concatenate every chunk in event order to reconstruct the full audio.","title":"Audio Base64","type":"string"},"content":{"description":"Text content described by this event's latest alignment snapshot. Long input may be split into multiple content chunks in one stream.","title":"Content","type":"string"},"alignment":{"anyOf":[{"$ref":"#/components/schemas/TTSTimestampAlignment"},{"type":"null"}],"description":"Latest cumulative timestamp snapshot for `chunk_seq`. When present, replace the previous alignment for the same `chunk_seq`; do not append segments. Null means no alignment snapshot has been produced yet or alignment is unavailable."},"chunk_seq":{"description":"Sequence number of the text chunk described by `alignment`. Clients should bucket alignment snapshots by this value.","minimum":0,"title":"Chunk Seq","type":"integer"},"chunk_audio_offset_sec":{"description":"Absolute start time of this text chunk within the full audio, in seconds.","minimum":0.0,"title":"Chunk Audio Offset Sec","type":"number"}},"required":["audio_base64","content","alignment","chunk_seq","chunk_audio_offset_sec"],"title":"TTSTimestampStreamEvent","type":"object"},"examples":{"first_event":{"summary":"First text chunk event with alignment","value":"data: {\"audio_base64\": \"SUQzBAAAAAAA...\", \"content\": \"I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? 
I’ve thought about calling you so many times, but I never knew where to start.\", \"alignment\": {\"segments\": [{\"text\": \"I\", \"start\": 0.0, \"end\": 0.16}, {\"text\": \"can't\", \"start\": 0.16, \"end\": 0.48}, {\"text\": \"believe\", \"start\": 0.48, \"end\": 0.8}, {\"text\": \"its\", \"start\": 0.8, \"end\": 1.12}, {\"text\": \"been\", \"start\": 1.2, \"end\": 1.44}, {\"text\": \"this\", \"start\": 1.44, \"end\": 1.76}, {\"text\": \"long\", \"start\": 1.76, \"end\": 2.48}, {\"text\": \"It\", \"start\": 2.56, \"end\": 2.64}, {\"text\": \"feels\", \"start\": 2.72, \"end\": 3.04}, {\"text\": \"like\", \"start\": 3.12, \"end\": 3.28}, {\"text\": \"forever\", \"start\": 3.36, \"end\": 4.0}, {\"text\": \"since\", \"start\": 4.0, \"end\": 4.32}, {\"text\": \"we\", \"start\": 4.32, \"end\": 4.48}, {\"text\": \"last\", \"start\": 4.48, \"end\": 4.96}, {\"text\": \"really\", \"start\": 4.96, \"end\": 5.28}, {\"text\": \"talked\", \"start\": 5.28, \"end\": 5.84}, {\"text\": \"Ive\", \"start\": 6.0, \"end\": 6.24}, {\"text\": \"missed\", \"start\": 6.24, \"end\": 6.64}, {\"text\": \"hearing\", \"start\": 6.64, \"end\": 6.96}, {\"text\": \"your\", \"start\": 6.96, \"end\": 7.2}, {\"text\": \"voice\", \"start\": 7.2, \"end\": 7.76}, {\"text\": \"your\", \"start\": 7.76, \"end\": 7.92}, {\"text\": \"stories\", \"start\": 7.92, \"end\": 8.48}, {\"text\": \"even\", \"start\": 8.48, \"end\": 8.72}, {\"text\": \"the\", \"start\": 8.72, \"end\": 8.8}, {\"text\": \"little\", \"start\": 8.8, \"end\": 9.2}, {\"text\": \"things\", \"start\": 9.2, \"end\": 9.52}, {\"text\": \"you\", \"start\": 9.52, \"end\": 9.68}, {\"text\": \"used\", \"start\": 9.68, \"end\": 10.0}, {\"text\": \"to\", \"start\": 10.0, \"end\": 10.08}, {\"text\": \"say\", \"start\": 10.08, \"end\": 10.64}, {\"text\": \"How\", \"start\": 10.64, \"end\": 10.96}, {\"text\": \"have\", \"start\": 10.96, \"end\": 11.12}, {\"text\": \"you\", \"start\": 11.12, \"end\": 11.36}, {\"text\": \"been\", \"start\": 11.36, 
\"end\": 11.92}, {\"text\": \"Ive\", \"start\": 12.0, \"end\": 12.24}, {\"text\": \"thought\", \"start\": 12.24, \"end\": 12.48}, {\"text\": \"about\", \"start\": 12.48, \"end\": 12.8}, {\"text\": \"calling\", \"start\": 12.8, \"end\": 13.2}, {\"text\": \"you\", \"start\": 13.2, \"end\": 13.36}, {\"text\": \"so\", \"start\": 13.36, \"end\": 13.68}, {\"text\": \"many\", \"start\": 13.68, \"end\": 13.92}, {\"text\": \"times\", \"start\": 13.92, \"end\": 14.56}, {\"text\": \"but\", \"start\": 14.56, \"end\": 14.72}, {\"text\": \"I\", \"start\": 14.72, \"end\": 14.88}, {\"text\": \"never\", \"start\": 14.88, \"end\": 15.2}, {\"text\": \"knew\", \"start\": 15.2, \"end\": 15.36}, {\"text\": \"where\", \"start\": 15.36, \"end\": 15.6}, {\"text\": \"to\", \"start\": 15.6, \"end\": 15.6}, {\"text\": \"start\", \"start\": 15.68, \"end\": 16.24}], \"audio_duration\": 16.24}, \"chunk_seq\": 0, \"chunk_audio_offset_sec\": 0.0}\n\n"},"following_event":{"summary":"Following audio event with latest alignment snapshot","value":"data: {\"audio_base64\": \"//uSxOAAF...\", \"content\": \"I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? 
I’ve thought about calling you so many times, but I never knew where to start.\", \"alignment\": {\"segments\": [{\"text\": \"I\", \"start\": 0.0, \"end\": 0.16}, {\"text\": \"can't\", \"start\": 0.16, \"end\": 0.48}, {\"text\": \"believe\", \"start\": 0.48, \"end\": 0.8}, {\"text\": \"its\", \"start\": 0.8, \"end\": 1.12}, {\"text\": \"been\", \"start\": 1.2, \"end\": 1.44}, {\"text\": \"this\", \"start\": 1.44, \"end\": 1.76}, {\"text\": \"long\", \"start\": 1.76, \"end\": 2.48}, {\"text\": \"It\", \"start\": 2.56, \"end\": 2.64}, {\"text\": \"feels\", \"start\": 2.72, \"end\": 3.04}, {\"text\": \"like\", \"start\": 3.12, \"end\": 3.28}, {\"text\": \"forever\", \"start\": 3.36, \"end\": 4.0}, {\"text\": \"since\", \"start\": 4.0, \"end\": 4.32}, {\"text\": \"we\", \"start\": 4.32, \"end\": 4.48}, {\"text\": \"last\", \"start\": 4.48, \"end\": 4.96}, {\"text\": \"really\", \"start\": 4.96, \"end\": 5.28}, {\"text\": \"talked\", \"start\": 5.28, \"end\": 5.84}, {\"text\": \"Ive\", \"start\": 6.0, \"end\": 6.24}, {\"text\": \"missed\", \"start\": 6.24, \"end\": 6.64}, {\"text\": \"hearing\", \"start\": 6.64, \"end\": 6.96}, {\"text\": \"your\", \"start\": 6.96, \"end\": 7.2}, {\"text\": \"voice\", \"start\": 7.2, \"end\": 7.76}, {\"text\": \"your\", \"start\": 7.76, \"end\": 7.92}, {\"text\": \"stories\", \"start\": 7.92, \"end\": 8.48}, {\"text\": \"even\", \"start\": 8.48, \"end\": 8.72}, {\"text\": \"the\", \"start\": 8.72, \"end\": 8.8}, {\"text\": \"little\", \"start\": 8.8, \"end\": 9.2}, {\"text\": \"things\", \"start\": 9.2, \"end\": 9.52}, {\"text\": \"you\", \"start\": 9.52, \"end\": 9.68}, {\"text\": \"used\", \"start\": 9.68, \"end\": 10.0}, {\"text\": \"to\", \"start\": 10.0, \"end\": 10.08}, {\"text\": \"say\", \"start\": 10.08, \"end\": 10.64}, {\"text\": \"How\", \"start\": 10.64, \"end\": 10.96}, {\"text\": \"have\", \"start\": 10.96, \"end\": 11.12}, {\"text\": \"you\", \"start\": 11.12, \"end\": 11.36}, {\"text\": \"been\", \"start\": 11.36, 
\"end\": 11.92}, {\"text\": \"Ive\", \"start\": 12.0, \"end\": 12.24}, {\"text\": \"thought\", \"start\": 12.24, \"end\": 12.48}, {\"text\": \"about\", \"start\": 12.48, \"end\": 12.8}, {\"text\": \"calling\", \"start\": 12.8, \"end\": 13.2}, {\"text\": \"you\", \"start\": 13.2, \"end\": 13.36}, {\"text\": \"so\", \"start\": 13.36, \"end\": 13.68}, {\"text\": \"many\", \"start\": 13.68, \"end\": 13.92}, {\"text\": \"times\", \"start\": 13.92, \"end\": 14.56}, {\"text\": \"but\", \"start\": 14.56, \"end\": 14.72}, {\"text\": \"I\", \"start\": 14.72, \"end\": 14.88}, {\"text\": \"never\", \"start\": 14.88, \"end\": 15.2}, {\"text\": \"knew\", \"start\": 15.2, \"end\": 15.36}, {\"text\": \"where\", \"start\": 15.36, \"end\": 15.6}, {\"text\": \"to\", \"start\": 15.6, \"end\": 15.6}, {\"text\": \"start\", \"start\": 15.68, \"end\": 16.24}], \"audio_duration\": 16.24}, \"chunk_seq\": 0, \"chunk_audio_offset_sec\": 0.0}\n\n"},"later_text_chunk_event":{"summary":"Later text chunk event with another alignment","value":"data: {\"audio_base64\": \"//uSxImAl...\", \"content\": \"Seeing you again now makes me realize just how much I’ve missed you. 
We have so much to catch up on, and I don’t even know which part of my life to tell you about first.\", \"alignment\": {\"segments\": [{\"text\": \"Seeing\", \"start\": 0.4, \"end\": 0.8}, {\"text\": \"you\", \"start\": 0.8, \"end\": 0.96}, {\"text\": \"again\", \"start\": 0.96, \"end\": 1.44}, {\"text\": \"now\", \"start\": 1.44, \"end\": 1.68}, {\"text\": \"makes\", \"start\": 1.68, \"end\": 2.08}, {\"text\": \"me\", \"start\": 2.08, \"end\": 2.24}, {\"text\": \"realize\", \"start\": 2.24, \"end\": 2.8}, {\"text\": \"just\", \"start\": 2.8, \"end\": 3.12}, {\"text\": \"how\", \"start\": 3.12, \"end\": 3.28}, {\"text\": \"much\", \"start\": 3.28, \"end\": 3.6}, {\"text\": \"Ive\", \"start\": 3.6, \"end\": 3.76}, {\"text\": \"missed\", \"start\": 3.84, \"end\": 4.24}, {\"text\": \"you\", \"start\": 4.24, \"end\": 4.56}, {\"text\": \"We\", \"start\": 4.64, \"end\": 4.8}, {\"text\": \"have\", \"start\": 4.8, \"end\": 5.04}, {\"text\": \"so\", \"start\": 5.04, \"end\": 5.36}, {\"text\": \"much\", \"start\": 5.36, \"end\": 5.76}, {\"text\": \"to\", \"start\": 5.76, \"end\": 5.76}, {\"text\": \"catch\", \"start\": 5.76, \"end\": 6.16}, {\"text\": \"up\", \"start\": 6.16, \"end\": 6.4}, {\"text\": \"on\", \"start\": 6.4, \"end\": 6.72}, {\"text\": \"and\", \"start\": 6.8, \"end\": 6.96}, {\"text\": \"I\", \"start\": 6.96, \"end\": 7.04}, {\"text\": \"dont\", \"start\": 7.04, \"end\": 7.36}, {\"text\": \"even\", \"start\": 7.36, \"end\": 7.6}, {\"text\": \"know\", \"start\": 7.6, \"end\": 7.84}, {\"text\": \"which\", \"start\": 7.84, \"end\": 8.08}, {\"text\": \"part\", \"start\": 8.08, \"end\": 8.4}, {\"text\": \"of\", \"start\": 8.4, \"end\": 8.48}, {\"text\": \"my\", \"start\": 8.56, \"end\": 8.72}, {\"text\": \"life\", \"start\": 8.72, \"end\": 8.96}, {\"text\": \"to\", \"start\": 9.12, \"end\": 9.12}, {\"text\": \"tell\", \"start\": 9.12, \"end\": 9.44}, {\"text\": \"you\", \"start\": 9.44, \"end\": 9.6}, {\"text\": \"about\", \"start\": 9.6, \"end\": 10.0}, 
{\"text\": \"first\", \"start\": 10.08, \"end\": 10.48}], \"audio_duration\": 10.48}, \"chunk_seq\": 1, \"chunk_audio_offset_sec\": 16.24}\n\n"}}}}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"402":{"description":"No payment -- see charging schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["OpenAPI v1"],"x-codeSamples":[{"lang":"bash","label":"Stream With Timestamps","source":"curl --no-buffer --request POST \\\n  --url https://api.fish.audio/v1/tts/stream/with-timestamp \\\n  --header 'Authorization: Bearer <token>' \\\n  --header 'Content-Type: application/json' \\\n  --header 'model: s2-pro' \\\n  --data '{\n    \"text\": \"[happy] I can’t believe it’s been this long. It feels like forever since we last really talked. I’ve missed hearing your voice, your stories, even the little things you used to say. How have you been? I’ve thought about calling you so many times, but I never knew where to start. Seeing you again now makes me realize just how much I’ve missed you. 
We have so much to catch up on, and I don’t even know which part of my life to tell you about first.\",\n    \"format\": \"opus\",\n    \"normalize\": true,\n    \"temperature\": 0.9,\n    \"chunk_length\": 100,\n    \"top_p\": 0.9,\n    \"latency\": \"balanced\",\n    \"sample_rate\": 48000,\n    \"reference_id\": \"fbe02f8306fc4d3d915e9871722a39d5\"\n  }'"}]}},"/v1/asr":{"post":{"summary":"Speech to Text","security":[{"BearerAuth":[]}],"requestBody":{"required":true,"content":{"application/json":{"schema":{"properties":{"audio":{"description":"Audio to be converted to text","format":"binary","title":"Audio","type":"string"},"language":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Language to be used for the speech","title":"Language"},"ignore_timestamps":{"default":true,"description":"Whether to skip precise timestamps in the text. Set to false to return precise timestamps; this will increase the latency for audio shorter than 30 seconds","title":"Ignore Timestamps","type":"boolean"}},"required":["audio"],"type":"object"}},"application/msgpack":{"schema":{"properties":{"audio":{"description":"Audio to be converted to text","format":"binary","title":"Audio","type":"string"},"language":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Language to be used for the speech","title":"Language"},"ignore_timestamps":{"default":true,"description":"Whether to skip precise timestamps in the text. Set to false to return precise timestamps; this will increase the latency for audio shorter than 30 seconds","title":"Ignore Timestamps","type":"boolean"}},"required":["audio"],"type":"object"}}}},"responses":{"200":{"description":"Request fulfilled, document follows","headers":{},"content":{"application/json":{"schema":{"properties":{"text":{"title":"Text","type":"string"},"duration":{"description":"Duration of the audio in 
seconds","title":"Duration","type":"number"},"segments":{"items":{"$ref":"#/components/schemas/ASRSegment"},"title":"Segments","type":"array"}},"required":["text","duration","segments"],"type":"object"}}}},"401":{"description":"No permission -- see authorization schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"402":{"description":"No payment -- see charging schemes","headers":{},"content":{"application/json":{"schema":{"properties":{"status":{"title":"Status","type":"integer"},"message":{"title":"Message","type":"string"}},"required":["status","message"],"type":"object"}}}},"422":{"description":"","headers":{},"content":{"application/json":{"schema":{"type":"array","items":{"type":"object","properties":{"loc":{"title":"Location","description":"error field","type":"array","items":{"type":"string"}},"type":{"title":"Type","description":"error type","type":"string"},"msg":{"title":"Message","description":"error message","type":"string"},"ctx":{"title":"Context","description":"error context","type":"string"},"in":{"title":"In","type":"string","enum":["path","query","header","cookie","body"]}},"required":["loc","type","msg"]}}}}}},"tags":["OpenAPI v1"]}}},"tags":[],"components":{"securitySchemes":{"BearerAuth":{"type":"http","scheme":"bearer"}},"schemas":{"AuthorEntity":{"properties":{"_id":{"title":"Id","type":"string"},"nickname":{"title":"Nickname","type":"string"},"avatar":{"title":"Avatar","type":"string"}},"required":["_id","nickname","avatar"],"title":"AuthorEntity","type":"object"},"ModelAudioQualityEntity":{"properties":{"filename":{"title":"Filename","type":"string"},"duration_ms":{"title":"Duration 
Ms","type":"number"},"language":{"default":"unknown","title":"Language","type":"string"},"quality":{"additionalProperties":{"type":"number"},"title":"Quality","type":"object"},"quality_passed":{"default":false,"title":"Quality Passed","type":"boolean"},"quality_reason":{"default":"","title":"Quality Reason","type":"string"}},"required":["filename","duration_ms"],"title":"ModelAudioQualityEntity","type":"object"},"ModelEntity":{"properties":{"_id":{"title":"Id","type":"string"},"type":{"enum":["svc","tts"],"title":"Type","type":"string"},"title":{"title":"Title","type":"string"},"description":{"default":"","title":"Description","type":"string"},"cover_image":{"default":"","title":"Cover Image","type":"string"},"train_mode":{"default":"full","enum":["fast","full"],"title":"Train Mode","type":"string"},"state":{"enum":["created","training","trained","failed"],"title":"State","type":"string"},"tags":{"items":{"type":"string"},"title":"Tags","type":"array"},"samples":{"default":[],"items":{"$ref":"#/components/schemas/SampleEntity"},"title":"Samples","type":"array"},"created_at":{"format":"date-time","title":"Created At","type":"string"},"updated_at":{"format":"date-time","title":"Updated At","type":"string"},"languages":{"default":[],"items":{"type":"string"},"title":"Languages","type":"array"},"visibility":{"enum":["public","unlist","private"],"title":"Visibility","type":"string"},"lock_visibility":{"default":false,"title":"Lock Visibility","type":"boolean"},"dmca_taken_down":{"anyOf":[{"type":"boolean"},{"type":"null"}],"default":false,"title":"Dmca Taken Down"},"default_text":{"default":"","title":"Default Text","type":"string"},"quality":{"anyOf":[{"$ref":"#/components/schemas/ModelQualityEntity"},{"type":"null"}],"default":null},"like_count":{"title":"Like Count","type":"integer"},"mark_count":{"title":"Mark Count","type":"integer"},"shared_count":{"title":"Shared Count","type":"integer"},"task_count":{"title":"Task 
Count","type":"integer"},"unliked":{"default":false,"title":"Unliked","type":"boolean"},"liked":{"default":false,"title":"Liked","type":"boolean"},"marked":{"default":false,"title":"Marked","type":"boolean"},"author":{"$ref":"#/components/schemas/AuthorEntity"}},"required":["_id","type","title","state","tags","created_at","updated_at","visibility","like_count","mark_count","shared_count","task_count","author"],"title":"ModelEntity","type":"object"},"ModelQualityEntity":{"properties":{"audios":{"items":{"$ref":"#/components/schemas/ModelAudioQualityEntity"},"title":"Audios","type":"array"},"created_at":{"format":"date-time","title":"Created At","type":"string"},"updated_at":{"format":"date-time","title":"Updated At","type":"string"}},"required":["created_at","updated_at"],"title":"ModelQualityEntity","type":"object"},"SampleEntity":{"properties":{"title":{"title":"Title","type":"string"},"text":{"title":"Text","type":"string"},"task_id":{"title":"Task Id","type":"string"},"audio":{"title":"Audio","type":"string"}},"required":["title","text","task_id","audio"],"title":"SampleEntity","type":"object"},"ProsodyControl":{"description":"Controls for adjusting the prosody (rhythm and intonation) of generated speech.","properties":{"speed":{"default":1.0,"description":"Speaking rate multiplier. Valid range: 0.5 to 2.0. 1.0 = normal speed, 0.5 = half speed, 2.0 = double speed. Useful for adjusting pacing without regenerating audio.","title":"Speed","type":"number"},"volume":{"default":0.0,"description":"Volume adjustment in decibels (dB). 0 = no change, positive values = louder, negative values = quieter.","title":"Volume","type":"number"},"normalize_loudness":{"default":true,"description":"Normalize output loudness for more consistent perceived volume. **S2-Pro only.**","title":"Normalize Loudness","type":"boolean"}},"title":"ProsodyControl","type":"object"},"ReferenceAudio":{"description":"A voice sample with its transcript, used for zero-shot voice cloning. 
The model will attempt to match the voice characteristics from the audio sample.","properties":{"audio":{"description":"Raw audio bytes of the voice sample. Supported formats: WAV, MP3, FLAC. For best results, use 10-30 seconds of clear speech with minimal background noise.","format":"binary","title":"Audio","type":"string"},"text":{"description":"The exact transcript of what is spoken in the audio sample. Accuracy is important for voice cloning quality.","title":"Text","type":"string"}},"required":["audio","text"],"title":"ReferenceAudio","type":"object"},"TTSRequest":{"description":"Request body for text-to-speech synthesis. Supports single-speaker synthesis on all compatible TTS models. Multi-speaker dialogue synthesis is only available with the S2-Pro model.\n\n## Single Speaker\nProvide either `reference_id` (string) pointing to a voice model, or `references` (array of ReferenceAudio) for zero-shot cloning.\n\n## Multiple Speakers (Dialogue, S2-Pro only)\nFor multi-speaker synthesis, provide:\n- `reference_id`: array of voice model IDs, e.g., [\"speaker-0-id\", \"speaker-1-id\"]\n- `text`: use speaker tags `<|speaker:0|>`, `<|speaker:1|>`, etc. to indicate speaker changes, e.g., \"<|speaker:0|>Hello!<|speaker:1|>Hi there!\"\n\nAlternatively, for zero-shot multi-speaker:\n- `references`: 2D array where each inner array contains references for one speaker\n- `reference_id`: array of identifiers (can be arbitrary strings for zero-shot)\n\n## Example (Multi-Speaker with Model IDs)\n```json\n{\n  \"text\": \"<|speaker:0|>Good morning!<|speaker:1|>Good morning! How are you?<|speaker:0|>I'm great, thanks!\",\n  \"reference_id\": [\"model-id-alice\", \"model-id-bob\"]\n}\n```","properties":{"text":{"description":"Text to convert to speech.","title":"Text","type":"string"},"temperature":{"default":0.7,"description":"Controls expressiveness. 
Higher is more varied, lower is more consistent.","maximum":1.0,"minimum":0.0,"title":"Temperature","type":"number"},"top_p":{"default":0.7,"description":"Controls diversity via nucleus sampling.","maximum":1.0,"minimum":0.0,"title":"Top P","type":"number"},"references":{"anyOf":[{"description":"Single speaker: array of reference audio samples","items":{"$ref":"#/components/schemas/ReferenceAudio"},"type":"array"},{"description":"Multiple speakers: array of arrays, where each inner array contains reference samples for one speaker","items":{"items":{"$ref":"#/components/schemas/ReferenceAudio"},"type":"array"},"type":"array"},{"type":"null"}],"description":"Inline voice references for zero-shot cloning. Requires MessagePack (not JSON). For single speaker, provide an array of ReferenceAudio objects. For multiple speakers, provide an array of arrays where each inner array contains references for one speaker. **Multi-speaker is only available with the S2-Pro model.** The speaker index corresponds to the index in reference_id array. Example for multi-speaker: [[{audio, text}], [{audio, text}, {audio, text}]] for 2 speakers where speaker 1 has 2 reference samples.","title":"References"},"reference_id":{"anyOf":[{"description":"Single speaker: voice model ID string","type":"string"},{"description":"Multiple speakers: array of voice model IDs, one per speaker","items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Voice model ID(s) from Fish Audio library or your custom models. For single-speaker synthesis, provide a string. For multi-speaker synthesis (dialogue), provide an array of model IDs. **Multi-speaker is only available with the S2-Pro model.** When using multiple speakers, use speaker tags in your text like `<|speaker:0|>` and `<|speaker:1|>` to indicate speaker changes. 
Example: `<|speaker:0|>Hello!<|speaker:1|>Hi there!<|speaker:0|>How are you?` with `reference_id: [\"speaker-a-id\", \"speaker-b-id\"]`.","title":"Reference Id"},"prosody":{"anyOf":[{"$ref":"#/components/schemas/ProsodyControl"},{"type":"null"}],"default":null,"description":"Speed and volume adjustments for the output."},"chunk_length":{"default":300,"description":"Text segment size for processing.","maximum":300,"minimum":100,"title":"Chunk Length","type":"integer"},"normalize":{"default":true,"description":"Normalizes text for English and Chinese, improving stability for numbers.","title":"Normalize","type":"boolean"},"format":{"default":"mp3","description":"Output audio format.","enum":["wav","pcm","mp3","opus"],"title":"Format","type":"string"},"sample_rate":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Audio sample rate in Hz. When null, uses the format's default (44100 Hz for most formats, 48000 Hz for opus).","title":"Sample Rate"},"mp3_bitrate":{"default":128,"description":"MP3 bitrate in kbps. Only applies when format is mp3.","enum":[64,128,192],"title":"Mp3 Bitrate","type":"integer"},"opus_bitrate":{"default":-1000,"description":"Opus bitrate in bps. -1000 for automatic. Only applies when format is opus.","enum":[-1000,24000,32000,48000,64000],"title":"Opus Bitrate","type":"integer"},"latency":{"default":"normal","description":"Latency-quality trade-off. normal: best quality, balanced: reduced latency, low: lowest latency.","enum":["low","normal","balanced"],"title":"Latency","type":"string"},"max_new_tokens":{"default":1024,"description":"Maximum audio tokens to generate per text chunk.","title":"Max New Tokens","type":"integer"},"repetition_penalty":{"default":1.2,"description":"Penalty for repeating audio patterns. 
Values above 1.0 reduce repetition.","title":"Repetition Penalty","type":"number"},"min_chunk_length":{"default":50,"description":"Minimum characters before splitting into a new chunk.","maximum":100,"minimum":0,"title":"Min Chunk Length","type":"integer"},"condition_on_previous_chunks":{"default":true,"description":"Use previous audio as context for voice consistency.","title":"Condition On Previous Chunks","type":"boolean"},"early_stop_threshold":{"default":1.0,"description":"Early stopping threshold for batch processing.","maximum":1.0,"minimum":0.0,"title":"Early Stop Threshold","type":"number"}},"required":["text"],"title":"TTSRequest","type":"object"},"TTSStreamWithTimestampRequest":{"description":"Request body for streaming text-to-speech synthesis with timestamp alignment. The request fields match the standard TTS endpoint, but the response is delivered as a Server-Sent Events stream. Each SSE payload includes an audio chunk and, when available, the latest cumulative alignment snapshot for a `chunk_seq`. Clients should concatenate `audio_base64` chunks in arrival order and replace the stored alignment for each `chunk_seq` whenever a newer snapshot is received.","properties":{"text":{"description":"Text to convert to speech.","title":"Text","type":"string"},"temperature":{"default":0.7,"description":"Controls expressiveness. 
Higher is more varied, lower is more consistent.","maximum":1.0,"minimum":0.0,"title":"Temperature","type":"number"},"top_p":{"default":0.7,"description":"Controls diversity via nucleus sampling.","maximum":1.0,"minimum":0.0,"title":"Top P","type":"number"},"references":{"anyOf":[{"description":"Single speaker: array of reference audio samples","items":{"$ref":"#/components/schemas/ReferenceAudio"},"type":"array"},{"description":"Multiple speakers: array of arrays, where each inner array contains reference samples for one speaker","items":{"items":{"$ref":"#/components/schemas/ReferenceAudio"},"type":"array"},"type":"array"},{"type":"null"}],"description":"Inline voice references for zero-shot cloning. Requires MessagePack (not JSON). For single speaker, provide an array of ReferenceAudio objects. For multiple speakers, provide an array of arrays where each inner array contains references for one speaker. **Multi-speaker is only available with the S2-Pro model.** The speaker index corresponds to the index in reference_id array. Example for multi-speaker: [[{audio, text}], [{audio, text}, {audio, text}]] for 2 speakers where speaker 1 has 2 reference samples.","title":"References"},"reference_id":{"anyOf":[{"description":"Single speaker: voice model ID string","type":"string"},{"description":"Multiple speakers: array of voice model IDs, one per speaker","items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Voice model ID(s) from Fish Audio library or your custom models. For single-speaker synthesis, provide a string. For multi-speaker synthesis (dialogue), provide an array of model IDs. **Multi-speaker is only available with the S2-Pro model.** When using multiple speakers, use speaker tags in your text like `<|speaker:0|>` and `<|speaker:1|>` to indicate speaker changes. 
Example: `<|speaker:0|>Hello!<|speaker:1|>Hi there!<|speaker:0|>How are you?` with `reference_id: [\"speaker-a-id\", \"speaker-b-id\"]`.","title":"Reference Id"},"prosody":{"anyOf":[{"$ref":"#/components/schemas/ProsodyControl"},{"type":"null"}],"default":null,"description":"Speed and volume adjustments for the output."},"chunk_length":{"default":300,"description":"Text segment size for processing.","maximum":300,"minimum":100,"title":"Chunk Length","type":"integer"},"normalize":{"default":true,"description":"Normalizes text for English and Chinese, improving stability for numbers.","title":"Normalize","type":"boolean"},"format":{"default":"mp3","description":"Output audio format.","enum":["wav","pcm","mp3","opus"],"title":"Format","type":"string"},"sample_rate":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Audio sample rate in Hz. When null, uses the format's default (44100 Hz for most formats, 48000 Hz for opus).","title":"Sample Rate"},"mp3_bitrate":{"default":128,"description":"MP3 bitrate in kbps. Only applies when format is mp3.","enum":[64,128,192],"title":"Mp3 Bitrate","type":"integer"},"opus_bitrate":{"default":-1000,"description":"Opus bitrate in bps. -1000 for automatic. Only applies when format is opus.","enum":[-1000,24000,32000,48000,64000],"title":"Opus Bitrate","type":"integer"},"latency":{"default":"normal","description":"Latency-quality trade-off. normal: best quality, balanced: reduced latency, low: lowest latency.","enum":["low","normal","balanced"],"title":"Latency","type":"string"},"max_new_tokens":{"default":1024,"description":"Maximum audio tokens to generate per text chunk.","title":"Max New Tokens","type":"integer"},"repetition_penalty":{"default":1.2,"description":"Penalty for repeating audio patterns. 
Values above 1.0 reduce repetition.","title":"Repetition Penalty","type":"number"},"min_chunk_length":{"default":50,"description":"Minimum characters before splitting into a new chunk.","maximum":100,"minimum":0,"title":"Min Chunk Length","type":"integer"},"condition_on_previous_chunks":{"default":true,"description":"Use previous audio as context for voice consistency.","title":"Condition On Previous Chunks","type":"boolean"},"early_stop_threshold":{"default":1.0,"description":"Early stopping threshold for batch processing.","maximum":1.0,"minimum":0.0,"title":"Early Stop Threshold","type":"number"}},"required":["text"],"title":"TTSStreamWithTimestampRequest","type":"object"},"TTSTimestampAlignment":{"properties":{"segments":{"description":"Ordered text timing segments for the generated audio.","items":{"$ref":"#/components/schemas/TTSTimestampSegment"},"title":"Segments","type":"array"},"audio_duration":{"description":"Audio duration in seconds for this alignment's content chunk.","title":"Audio Duration","type":"number"}},"required":["segments","audio_duration"],"title":"TTSTimestampAlignment","type":"object"},"TTSTimestampSegment":{"properties":{"text":{"description":"Text segment covered by this timing entry.","title":"Text","type":"string"},"start":{"description":"Segment start time in seconds.","title":"Start","type":"number"},"end":{"description":"Segment end time in seconds.","title":"End","type":"number"}},"required":["text","start","end"],"title":"TTSTimestampSegment","type":"object"},"ASRSegment":{"properties":{"text":{"title":"Text","type":"string"},"start":{"title":"Start","type":"number"},"end":{"title":"End","type":"number"}},"required":["text","start","end"],"title":"ASRSegment","type":"object"}}},"servers":[{"description":"Fish Audio API","url":"https://api.fish.audio"}]}