Skip to content Skip to sidebar Skip to footer

Parsing Instagram With Java Jsoup Not Give Elements Gives Source

I'm trying to get a reel's video URL with jsoup using Java in Android Studio. I want to get the elements shown in the browser inspector, but the code returns the page source. I use jsoup in other projects on different…

Solution 1:

If you check the source of the page (inspect the video element) you'll find:

<video class="tWeCl" playsinline="" poster="https://instagram.flhr4-2.fna.fbcdn.net/v/t51.2885-15/e35/117157253_120443486171759_7332785595039685871_n.jpg?_nc_ht=instagram.flhr4-2.fna.fbcdn.net&amp;_nc_cat=111&amp;_nc_ohc=aX7rVh9IbGoAX_lj74j&amp;oh=ba74c5c8ad97ba14c35710addd523dfd&amp;oe=5F363C59" preload="none" type="video/mp4" src="https://instagram.flhr4-2.fna.fbcdn.net/v/t50.2886-16/117284962_313567919762486_3343704909021624596_n.mp4?_nc_ht=instagram.flhr4-2.fna.fbcdn.net&amp;_nc_cat=102&amp;_nc_ohc=3wvoN4vNzkUAX_DLFTR&amp;oe=5F3659EF&amp;oh=7a38d593469a99239a7cb07050cc47f2"></video>

If you then search the HTML for the mp4 URL, you'll find it inside one of the <script> tags, where it is delivered as a JSON value. So by splitting the JavaScript text on " = " and taking the latter half, you get the raw JSON, which can then be parsed for "video_url" using Jayway's JsonPath.read method.

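It would seem the <video> tag is therefore generated in the HTML by the JavaScript, as it doesn't appear possible to filter the downloaded HTML for any <video> elements. (jsoup only parses the HTML returned by the server; it does not execute JavaScript, so elements created by scripts never appear in the parsed document.)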

import com.jayway.jsonpath.JsonPath;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

publicclassInstagram {

    privatefinal String url;

    publicInstagram(String url) {
        this.url = url;
    }

    publicvoidstart() {
        Documentdoc= getHtmlPage(url);
        ElementsvideoElement= getScriptElementContainingVideoUrl(doc);

        List<String> relevantTagWithMp4Url = getSingleScriptElementWithVideoUrl(videoElement);
        StringscriptInnerHtml= relevantTagWithMp4Url.get(0);

        System.out.println("Video Url: " + getVideoUrl(scriptInnerHtml));
    }

    private List<String> getSingleScriptElementWithVideoUrl(Elements scriptElements) {
        List<String> relevantTagWithMp4Url = newArrayList<>();

        for (Element element : scriptElements) {
            if (element.data().contains("mp4")) {
                relevantTagWithMp4Url.add(element.data());
            }
        }

        return relevantTagWithMp4Url;
    }

    private Elements getScriptElementContainingVideoUrl(Document doc) {
        return doc.select("script");
    }

    private String getVideoUrl(String videoElement) {
        StringjsonResponse= videoElement.split(" = ")[1];
        // $.. is equivalent to $.[*] - (a wild card matcher) - you may need to play with this
        List<String> videoUrl = JsonPath.read(jsonResponse, "$..video_url");
        return videoUrl.get(0);
    }

    private Document getHtmlPage(String url) {
        try {
            return Jsoup.connect(url).get();
        } catch (IOException e) {
            e.printStackTrace();
        }
        returnnull;
    }


    publicstaticvoidmain(String[] args) {
        newInstagram("https://www.instagram.com/reel/CDok74FJzHp/?igshid=cam8ylb7okl7").start();
    }
}

Post a Comment for "Parsing Instagram With Java Jsoup Not Give Elements Gives Source"