Parsing Instagram With Java Jsoup Not Give Elements Gives Source
I'm trying to get a reel's video URL with jsoup, using Java in Android Studio. I want the elements I see in the browser's inspector, but my code returns the raw page source instead. I use jsoup in other projects on different…
Solution 1:
If you check the source of the page (inspect the video element) you'll find:
<video class="tWeCl" playsinline="" poster="https://instagram.flhr4-2.fna.fbcdn.net/v/t51.2885-15/e35/117157253_120443486171759_7332785595039685871_n.jpg?_nc_ht=instagram.flhr4-2.fna.fbcdn.net&_nc_cat=111&_nc_ohc=aX7rVh9IbGoAX_lj74j&oh=ba74c5c8ad97ba14c35710addd523dfd&oe=5F363C59" preload="none" type="video/mp4" src="https://instagram.flhr4-2.fna.fbcdn.net/v/t50.2886-16/117284962_313567919762486_3343704909021624596_n.mp4?_nc_ht=instagram.flhr4-2.fna.fbcdn.net&_nc_cat=102&_nc_ohc=3wvoN4vNzkUAX_DLFTR&oe=5F3659EF&oh=7a38d593469a99239a7cb07050cc47f2"></video>
If you then search the HTML for the mp4 URL, you'll find it inside one of the <script> tags, where it is delivered as a JSON value. So by splitting the script text on " = " and taking the latter half, you get the raw JSON, which can then be parsed for the "video_url" key using Jayway's JsonPath.read method.
It would seem the video tag is therefore generated in the page by JavaScript at runtime, which is why it isn't possible to select any <video> elements from the HTML that jsoup downloads.
import com.jayway.jsonpath.JsonPath;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
publicclassInstagram {
privatefinal String url;
publicInstagram(String url) {
this.url = url;
}
publicvoidstart() {
Documentdoc= getHtmlPage(url);
ElementsvideoElement= getScriptElementContainingVideoUrl(doc);
List<String> relevantTagWithMp4Url = getSingleScriptElementWithVideoUrl(videoElement);
StringscriptInnerHtml= relevantTagWithMp4Url.get(0);
System.out.println("Video Url: " + getVideoUrl(scriptInnerHtml));
}
private List<String> getSingleScriptElementWithVideoUrl(Elements scriptElements) {
List<String> relevantTagWithMp4Url = newArrayList<>();
for (Element element : scriptElements) {
if (element.data().contains("mp4")) {
relevantTagWithMp4Url.add(element.data());
}
}
return relevantTagWithMp4Url;
}
private Elements getScriptElementContainingVideoUrl(Document doc) {
return doc.select("script");
}
private String getVideoUrl(String videoElement) {
StringjsonResponse= videoElement.split(" = ")[1];
// $.. is equivalent to $.[*] - (a wild card matcher) - you may need to play with this
List<String> videoUrl = JsonPath.read(jsonResponse, "$..video_url");
return videoUrl.get(0);
}
private Document getHtmlPage(String url) {
try {
return Jsoup.connect(url).get();
} catch (IOException e) {
e.printStackTrace();
}
returnnull;
}
publicstaticvoidmain(String[] args) {
newInstagram("https://www.instagram.com/reel/CDok74FJzHp/?igshid=cam8ylb7okl7").start();
}
}
Post a Comment for "Parsing Instagram With Java Jsoup Not Give Elements Gives Source"