Close Menu
    Facebook X (Twitter) Instagram
    Articles Stock
    • Home
    • Technology
    • AI
    • Pages
      • About ArticlesStock — AI & Technology Journalist
      • Contact us
      • Disclaimer For Articles Stock
      • Privacy Policy
      • Terms and Conditions
    Facebook X (Twitter) Instagram
    Articles Stock
    AI

    A Coding Implementation of Microsoft’s Phi-4-Mini for Quantized Inference, Reasoning, Tool Use, RAG and LoRA Fine-Tuning

    Naveed AhmadBy Naveed Ahmad21/04/2026Updated:21/04/2026No Comments2 Mins Read
    blog 58


    import subprocess, sys, os, shutil, glob
    
    
    def pip_install(args):
       """Quietly pip-install *args* into the current interpreter's environment.

       Args:
           args: Sequence of pip CLI arguments (package specs and/or flags).

       Raises:
           subprocess.CalledProcessError: if pip exits non-zero (check=True).
       """
       # `check=True` makes a failed install raise instead of passing silently.
       subprocess.run([sys.executable, "-m", "pip", "install", "-q", *args],
                      check=True)
    
    
    # Install huggingface_hub first, then the rest of the stack in one batch;
    # each spec is version-pinned to keep the tutorial reproducible.
    for _pkg_batch in (
       ["huggingface_hub>=0.26,<1.0"],
       [
          "-U",
          "transformers>=4.49,<4.57",
          "accelerate>=0.33.0",
          "bitsandbytes>=0.43.0",
          "peft>=0.11.0",
          "datasets>=2.20.0,<3.0",
          "sentence-transformers>=3.0.0,<4.0",
          "faiss-cpu",
       ],
    ):
       pip_install(_pkg_batch)
    
    
    # Remove any stale remote-code modules cached for Phi-4 so the freshly
    # installed transformers version is the only code path in play.
    _stale_dirs = glob.glob(os.path.expanduser(
        "~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4*"))
    for _stale in _stale_dirs:
        shutil.rmtree(_stale, ignore_errors=True)
    
    
    # Drop any already-imported copies of the ML stack so the next `import`
    # statements pick up the versions installed above. Iterate over a snapshot
    # (list(...)) because we mutate sys.modules while looping.
    for _m in list(sys.modules):
       if _m.startswith(("transformers", "huggingface_hub", "tokenizers",
                         "accelerate", "peft", "datasets",
                         "sentence_transformers")):
           del sys.modules[_m]
    
    
    import json, re, textwrap, warnings, torch
    # Suppress library warnings to keep the notebook/demo output readable.
    warnings.filterwarnings("ignore")
    
    
    from transformers import (
       AutoModelForCausalLM,
       AutoTokenizer,
       BitsAndBytesConfig,
       TextStreamer,
       TrainingArguments,
       Trainer,
       DataCollatorForLanguageModeling,
    )
    import transformers
    print(f"Using transformers {transformers.__version__}")
    
    
    # Hub repo id for the model used throughout; ships the built-in phi3
    # architecture, so no trust_remote_code is needed.
    PHI_MODEL_ID = "microsoft/Phi-4-mini-instruct"


    # 4-bit bitsandbytes loading requires CUDA; fail fast with a Colab hint.
    assert torch.cuda.is_available(), (
       "No GPU detected. In Colab: Runtime > Change runtime type > T4 GPU."
    )
    print(f"GPU detected: {torch.cuda.get_device_name(0)}")
    print(f"Loading Phi model (native phi3 arch, no remote code): {PHI_MODEL_ID}\n")
    
    
    # NF4 4-bit quantization with nested (double) quantization; computation
    # runs in bfloat16 on top of the quantized weights.
    _quant_opts = dict(
       load_in_4bit=True,
       bnb_4bit_quant_type="nf4",
       bnb_4bit_compute_dtype=torch.bfloat16,
       bnb_4bit_use_double_quant=True,
    )
    bnb_cfg = BitsAndBytesConfig(**_quant_opts)
    
    
    # Load the tokenizer; some checkpoints ship without a pad token, in which
    # case we reuse EOS so padded generation batches work.
    _tok = AutoTokenizer.from_pretrained(PHI_MODEL_ID)
    if _tok.pad_token_id is None:
       _tok.pad_token = _tok.eos_token
    phi_tokenizer = _tok
    
    
    # Load the causal LM with 4-bit weights, letting accelerate place layers
    # automatically across available devices.
    _load_opts = {
       "quantization_config": bnb_cfg,
       "device_map": "auto",
       "torch_dtype": torch.bfloat16,
    }
    phi_model = AutoModelForCausalLM.from_pretrained(PHI_MODEL_ID, **_load_opts)
    # Keep the KV cache on for fast autoregressive inference.
    phi_model.config.use_cache = True
    
    
    # Report load success: GPU memory in use, detected architecture, and a
    # rough parameter count (sum over all parameter tensors).
    print(f"\n✓ Phi-4-mini loaded in 4-bit. "
         f"GPU memory: {torch.cuda.memory_allocated()/1e9:.2f} GB")
    print(f"  Architecture: {phi_model.config.model_type}   "
         f"(using built-in {type(phi_model).__name__})")
    print(f"  Parameters: ~{sum(p.numel() for p in phi_model.parameters())/1e9:.2f}B")
    
    
    def ask_phi(messages, *, tools=None, max_new_tokens=512,
               temperature=0.3, stream=False):
       """Single entry point for all Phi-4-mini inference calls below.

       Args:
           messages: Chat messages in the format expected by the tokenizer's
               chat template.
           tools: Optional tool/function schemas forwarded to the chat
               template (`apply_chat_template(tools=...)`).
           max_new_tokens: Cap on generated tokens.
           temperature: Sampling temperature; 0 selects greedy decoding
               (do_sample=False). Nucleus sampling uses top_p=0.9.
           stream: If True, tokens are printed as they are generated.

       Returns:
           The decoded completion text with the prompt stripped and
           surrounding whitespace trimmed.
       """
       prompt_ids = phi_tokenizer.apply_chat_template(
           messages,
           tools=tools,
           add_generation_prompt=True,
           return_tensors="pt",
       ).to(phi_model.device)


       streamer = (TextStreamer(phi_tokenizer, skip_prompt=True,
                                skip_special_tokens=True)
                   if stream else None)


       with torch.inference_mode():
           out = phi_model.generate(
               prompt_ids,
               max_new_tokens=max_new_tokens,
               do_sample=temperature > 0,
               # Clamp to a tiny positive value: generate() rejects
               # temperature=0 when sampling flags are passed.
               temperature=max(temperature, 1e-5),
               top_p=0.9,
               pad_token_id=phi_tokenizer.pad_token_id,
               eos_token_id=phi_tokenizer.eos_token_id,
               streamer=streamer,
           )
       # Decode only the newly generated tokens (slice off the prompt).
       return phi_tokenizer.decode(
           out[0][prompt_ids.shape[1]:], skip_special_tokens=True
       ).strip()
    
    
    def banner(title):
       """Print *title* as a section divider framed by 78-char '=' rules."""
       print("\n" + "=" * 78 + f"\n  {title}\n" + "=" * 78)



    Source link

    Naveed Ahmad

    Naveed Ahmad is a technology journalist and AI writer at ArticlesStock, covering artificial intelligence, machine learning, and emerging tech policy. Read his latest articles.

    Related Posts

    It is not only one thing — it is another thing

    21/04/2026

    Anthropic takes $5B from Amazon and pledges $100B in cloud spending in return

    21/04/2026

    Moonshot AI Releases Kimi K2.6 with Long-Horizon Coding, Agent Swarm Scaling to 300 Sub-Agents and 4,000 Coordinated Steps

    21/04/2026
    Leave A Reply Cancel Reply

    Categories
    • AI
    Recent Comments
      Facebook X (Twitter) Instagram Pinterest
      © 2026 ThemeSphere. Designed by ThemeSphere.

      Type above and press Enter to search. Press Esc to cancel.