superdiff
/

superdiff-sdxl-v1-0

@@ -38,8 +38,10 @@ def int_beta(t):
     t :
         t
     """
-  a, b = get_scaled_coeffs()
-  return ((a+b*t)**3-a**3)/(3*b)
 def sigma(t):
     """sigma.
@@ -48,7 +50,9 @@ def sigma(t):
     t :
         t
     """
-  return torch.expm1(int_beta(t))**0.5
 def sigma_orig(t):
     """sigma_orig.
@@ -57,13 +61,13 @@ def sigma_orig(t):
     t :
         t
     """
-  return (-torch.expm1(-int_beta(t)))**0.5
 class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
     """SuperDiffSDXLPipeline."""
     def __init__(self, unet: Callable, vae: Callable, text_encoder: Callable, text_encoder_2: Callable, tokenizer: Callable, tokenizer_2: Callable) -> None:
         """__init__.
         Parameters
@@ -87,16 +91,16 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
         """
         super().__init__()
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        dtype=torch.float16
         vae.to(device)
         unet.to(device)
         text_encoder.to(device)
         text_encoder_2.to(device)
-        self.register_modules(unet=unet,
-                              vae=vae,
                               text_encoder=text_encoder,
                               text_encoder_2=text_encoder_2,
                               tokenizer=tokenizer,
@@ -119,34 +123,50 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
         width :
             width
         """
-        text_input = self.tokenizer(prompt_o* batch_size, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
-        text_input_2 = self.tokenizer_2(prompt_o* batch_size, padding="max_length", max_length=self.tokenizer_2.model_max_length, truncation=True, return_tensors="pt")
         with torch.no_grad():
-            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device), output_hidden_states=True)
-            text_embeddings_2 = self.text_encoder_2(text_input_2.input_ids.to(self.device), output_hidden_states=True)
-        prompt_embeds_o = torch.concat((text_embeddings.hidden_states[-2], text_embeddings_2.hidden_states[-2]), dim=-1)
         pooled_prompt_embeds_o = text_embeddings_2[0]
         negative_prompt_embeds = torch.zeros_like(prompt_embeds_o)
-        negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds_o)
-        text_input = self.tokenizer(prompt_b* batch_size, padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
-        text_input_2 = self.tokenizer_2(prompt_b* batch_size, padding="max_length", max_length=self.tokenizer_2.model_max_length, truncation=True, return_tensors="pt")
         with torch.no_grad():
-            text_embeddings = self.text_encoder(text_input.input_ids.to(self.device), output_hidden_states=True)
-            text_embeddings_2 = self.text_encoder_2(text_input_2.input_ids.to(self.device), output_hidden_states=True)
-        prompt_embeds_b = torch.concat((text_embeddings.hidden_states[-2], text_embeddings_2.hidden_states[-2]), dim=-1)
         pooled_prompt_embeds_b = text_embeddings_2[0]
-        add_time_ids_o = torch.tensor([(height,width,0,0,height,width)])
-        add_time_ids_b = torch.tensor([(height,width,0,0,height,width)])
-        negative_add_time_ids = torch.tensor([(height,width,0,0,height,width)])
-        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds_o, prompt_embeds_b], dim=0)
-        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds_o, pooled_prompt_embeds_b], dim=0)
-        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids_o, add_time_ids_b], dim=0)
         prompt_embeds = prompt_embeds.to(self.device)
         add_text_embeds = add_text_embeds.to(self.device)
         add_time_ids = add_time_ids.to(self.device).repeat(batch_size, 1)
-        added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
         return prompt_embeds, added_cond_kwargs
     @torch.no_grad
@@ -217,6 +237,15 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
         def v(_x, _e): return self.model(
             """v.
             Parameters
             ----------
             _x :
@@ -280,8 +309,10 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
             self.seed
         )  # Seed generator to create the initial latent noise
-        latents = torch.randn((batch_size, self.unet.in_channels, height // 8, width // 8), generator=self.generator, dtype=self.dtype, device=self.device,)
-        prompt_embeds, added_cond_kwargs = self.prepare_prompt_input(prompt_1, prompt_2, batch_size, height, width)
         return {
             "latents": latents,
@@ -317,18 +348,26 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
                 dsigma = sigma(t-dt) - sigma_t
                 latent_model_input /= (sigma_t**2+1)**0.5
                 with torch.no_grad():
-                    noise_pred = self.unet(latent_model_input, t*train_number_steps, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs, return_dict=False)[0]
-                noise_pred_uncond, noise_pred_text_o, noise_pred_text_b = noise_pred.chunk(3)
                 # noise = torch.sqrt(2*torch.abs(dsigma)*sigma_t)*torch.randn_like(latents)
-                noise = torch.sqrt(2*torch.abs(dsigma)*sigma_t)*torch.empty_like(latents, device=self.device).normal_(generator=self.generator)
-                dx_ind = 2*dsigma*(noise_pred_uncond + self.guidance_scale*(noise_pred_text_b - noise_pred_uncond)) + noise
-                kappa = (torch.abs(dsigma)*(noise_pred_text_b-noise_pred_text_o)*(noise_pred_text_b+noise_pred_text_o)).sum((1,2,3))-(dx_ind*((noise_pred_text_o-noise_pred_text_b))).sum((1,2,3))
-                kappa /= 2*dsigma*self.guidance_scale*((noise_pred_text_o-noise_pred_text_b)**2).sum((1,2,3))
-                noise_pred = noise_pred_uncond + self.guidance_scale*((noise_pred_text_b - noise_pred_uncond) + kappa[:,None,None,None]*(noise_pred_text_o-noise_pred_text_b))
                 if i < self.num_inference_steps - 1:
                     latents += 2*dsigma * noise_pred + noise
                 else:
@@ -354,7 +393,7 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
         latents = latents.to(torch.float32)
         with torch.no_grad():
             image = self.vae.decode(latents, return_dict=False)[0]
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
         images = (image * 255).round().astype("uint8")
@@ -389,7 +428,7 @@ class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
             height
         width : int
             width
-        guidance_scale : int
             guidance_scale
         Returns

     t :
         t
     """
+    a, b = get_scaled_coeffs()
+    return ((a+b*t)**3-a**3)/(3*b)
 def sigma(t):
     """sigma.
     t :
         t
     """
+    # Equals sqrt(exp(int_beta(t)) - 1); torch.expm1(x) evaluates exp(x) - 1 directly.
+    return torch.expm1(int_beta(t))**0.5
 def sigma_orig(t):
     """sigma_orig.
     t :
         t
     """
+    # Equals sqrt(1 - exp(-int_beta(t))), written as -(exp(-x) - 1) via torch.expm1.
+    return (-torch.expm1(-int_beta(t)))**0.5
 class SuperDiffSDXLPipeline(DiffusionPipeline, ConfigMixin):
     """SuperDiffSDXLPipeline."""
     def __init__(self, unet: Callable, vae: Callable, text_encoder: Callable, text_encoder_2: Callable, tokenizer: Callable, tokenizer_2: Callable) -> None:
         """__init__.
         Parameters
         """
         super().__init__()
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        dtype = torch.float16
         vae.to(device)
         unet.to(device)
         text_encoder.to(device)
         text_encoder_2.to(device)
+        self.register_modules(unet=unet,
+                              vae=vae,
                               text_encoder=text_encoder,
                               text_encoder_2=text_encoder_2,
                               tokenizer=tokenizer,
         width :
             width
         """
+        text_input = self.tokenizer(prompt_o * batch_size, padding="max_length",
+                                    max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
+        text_input_2 = self.tokenizer_2(prompt_o * batch_size, padding="max_length",
+                                        max_length=self.tokenizer_2.model_max_length, truncation=True, return_tensors="pt")
         with torch.no_grad():
+            text_embeddings = self.text_encoder(
+                text_input.input_ids.to(self.device), output_hidden_states=True)
+            text_embeddings_2 = self.text_encoder_2(
+                text_input_2.input_ids.to(self.device), output_hidden_states=True)
+        prompt_embeds_o = torch.concat(
+            (text_embeddings.hidden_states[-2], text_embeddings_2.hidden_states[-2]), dim=-1)
         pooled_prompt_embeds_o = text_embeddings_2[0]
         negative_prompt_embeds = torch.zeros_like(prompt_embeds_o)
+        negative_pooled_prompt_embeds = torch.zeros_like(
+            pooled_prompt_embeds_o)
+        text_input = self.tokenizer(prompt_b * batch_size, padding="max_length",
+                                    max_length=self.tokenizer.model_max_length, truncation=True, return_tensors="pt")
+        text_input_2 = self.tokenizer_2(prompt_b * batch_size, padding="max_length",
+                                        max_length=self.tokenizer_2.model_max_length, truncation=True, return_tensors="pt")
         with torch.no_grad():
+            text_embeddings = self.text_encoder(
+                text_input.input_ids.to(self.device), output_hidden_states=True)
+            text_embeddings_2 = self.text_encoder_2(
+                text_input_2.input_ids.to(self.device), output_hidden_states=True)
+        prompt_embeds_b = torch.concat(
+            (text_embeddings.hidden_states[-2], text_embeddings_2.hidden_states[-2]), dim=-1)
         pooled_prompt_embeds_b = text_embeddings_2[0]
+        add_time_ids_o = torch.tensor([(height, width, 0, 0, height, width)])
+        add_time_ids_b = torch.tensor([(height, width, 0, 0, height, width)])
+        negative_add_time_ids = torch.tensor(
+            [(height, width, 0, 0, height, width)])
+        prompt_embeds = torch.cat(
+            [negative_prompt_embeds, prompt_embeds_o, prompt_embeds_b], dim=0)
+        add_text_embeds = torch.cat(
+            [negative_pooled_prompt_embeds, pooled_prompt_embeds_o, pooled_prompt_embeds_b], dim=0)
+        add_time_ids = torch.cat(
+            [negative_add_time_ids, add_time_ids_o, add_time_ids_b], dim=0)
         prompt_embeds = prompt_embeds.to(self.device)
         add_text_embeds = add_text_embeds.to(self.device)
         add_time_ids = add_time_ids.to(self.device).repeat(batch_size, 1)
+        added_cond_kwargs = {
+            "text_embeds": add_text_embeds, "time_ids": add_time_ids}
         return prompt_embeds, added_cond_kwargs
     @torch.no_grad
         def v(_x, _e): return self.model(
             """v.
+            Parameters
+            ----------
+            _x :
+                _x
+            _e :
+                _e
+            """
+            """v.
             Parameters
             ----------
             _x :
             self.seed
         )  # Seed generator to create the initial latent noise
+        latents = torch.randn((batch_size, self.unet.in_channels, height // 8, width // 8),
+                              generator=self.generator, dtype=self.dtype, device=self.device,)
+        prompt_embeds, added_cond_kwargs = self.prepare_prompt_input(
+            prompt_1, prompt_2, batch_size, height, width)
         return {
             "latents": latents,
                 dsigma = sigma(t-dt) - sigma_t
                 latent_model_input /= (sigma_t**2+1)**0.5
                 with torch.no_grad():
+                    noise_pred = self.unet(latent_model_input, t*train_number_steps, encoder_hidden_states=prompt_embeds,
+                                           added_cond_kwargs=added_cond_kwargs, return_dict=False)[0]
+                noise_pred_uncond, noise_pred_text_o, noise_pred_text_b = noise_pred.chunk(
+                    3)
                 # noise = torch.sqrt(2*torch.abs(dsigma)*sigma_t)*torch.randn_like(latents)
+                noise = torch.sqrt(2*torch.abs(dsigma)*sigma_t)*torch.empty_like(
+                    latents, device=self.device).normal_(generator=self.generator)
+                dx_ind = 2*dsigma*(noise_pred_uncond + self.guidance_scale *
+                                   (noise_pred_text_b - noise_pred_uncond)) + noise
+                kappa = (torch.abs(dsigma)*(noise_pred_text_b-noise_pred_text_o)*(noise_pred_text_b+noise_pred_text_o)
+                         ).sum((1, 2, 3))-(dx_ind*((noise_pred_text_o-noise_pred_text_b))).sum((1, 2, 3))
+                kappa /= 2*dsigma*self.guidance_scale * \
+                    ((noise_pred_text_o-noise_pred_text_b)**2).sum((1, 2, 3))
+                noise_pred = noise_pred_uncond + self.guidance_scale * \
+                    ((noise_pred_text_b - noise_pred_uncond) +
+                     kappa[:, None, None, None]*(noise_pred_text_o-noise_pred_text_b))
                 if i < self.num_inference_steps - 1:
                     latents += 2*dsigma * noise_pred + noise
                 else:
         latents = latents.to(torch.float32)
         with torch.no_grad():
             image = self.vae.decode(latents, return_dict=False)[0]
         image = (image / 2 + 0.5).clamp(0, 1)
         image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
         images = (image * 255).round().astype("uint8")
             height
         width : int
             width
+        guidance_scale : float
             guidance_scale
         Returns